from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


import os
os.chdir('drive/MyDrive/DL2024_201810776/week12')

%load_ext autoreload
%autoreload 2


import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf


class PositionalEncoding(tf.keras.layers.Layer):
  def __init__(self, position, d_model):
    super(PositionalEncoding, self).__init__()
    self.pos_encoding = self.positional_encoding(position, d_model)

  def get_angles(self, position, i, d_model):
    angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
    return position * angles

  def positional_encoding(self, position, d_model):
    angle_rads = self.get_angles(
        position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
        i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
        d_model=d_model)

    # 배열의 짝수 인덱스(2i)에는 사인 함수 적용
    sines = tf.math.sin(angle_rads[:, 0::2])

    # 배열의 홀수 인덱스(2i+1)에는 코사인 함수 적용
    cosines = tf.math.cos(angle_rads[:, 1::2])

    angle_rads = np.zeros(angle_rads.shape)
    angle_rads[:, 0::2] = sines
    angle_rads[:, 1::2] = cosines
    pos_encoding = tf.constant(angle_rads)
    pos_encoding = pos_encoding[tf.newaxis, ...]

    print(pos_encoding.shape)
    return tf.cast(pos_encoding, tf.float32)

  def call(self, inputs):
    return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]


sample_pos_encoding = PositionalEncoding(50, 128)

plt.pcolormesh(sample_pos_encoding.pos_encoding.numpy()[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 128))
plt.ylabel('Position')
plt.colorbar()
plt.show()

(1, 50, 128)


def scaled_dot_product_attention(query, key, value, mask):
  # query 크기 : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
  # key 크기 : (batch_size, num_heads, key의 문장 길이, d_model/num_heads)
  # value 크기 : (batch_size, num_heads, value의 문장 길이, d_model/num_heads)
  # padding_mask : (batch_size, 1, 1, key의 문장 길이)

  # Q와 K의 곱. 어텐션 스코어 행렬.
  matmul_qk = tf.matmul(query, key, transpose_b=True)

  # 스케일링
  # dk의 루트값으로 나눠준다.
  depth = tf.cast(tf.shape(key)[-1], tf.float32)
  logits = matmul_qk / tf.math.sqrt(depth)

  # 마스킹. 어텐션 스코어 행렬의 마스킹 할 위치에 매우 작은 음수값을 넣는다.
  # 매우 작은 값이므로 소프트맥스 함수를 지나면 행렬의 해당 위치의 값은 0이 된다.
  if mask is not None:
    logits += (mask * -1e9)

  # 소프트맥스 함수는 마지막 차원인 key의 문장 길이 방향으로 수행된다.
  # attention weight : (batch_size, num_heads, query의 문장 길이, key의 문장 길이)
  attention_weights = tf.nn.softmax(logits, axis=-1)

  # output : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
  output = tf.matmul(attention_weights, value)

  return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):

  def __init__(self, d_model, num_heads, name="multi_head_attention"):
    super(MultiHeadAttention, self).__init__(name=name)
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    # d_model을 num_heads로 나눈 값.
    # 논문 기준 : 64
    self.depth = d_model // self.num_heads

    # WQ, WK, WV에 해당하는 밀집층 정의
    self.query_dense = tf.keras.layers.Dense(units=d_model)
    self.key_dense = tf.keras.layers.Dense(units=d_model)
    self.value_dense = tf.keras.layers.Dense(units=d_model)

    # WO에 해당하는 밀집층 정의
    self.dense = tf.keras.layers.Dense(units=d_model)

  # num_heads 개수만큼 q, k, v를 split하는 함수
  def split_heads(self, inputs, batch_size):
    inputs = tf.reshape(
        inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(inputs, perm=[0, 2, 1, 3])

  def call(self, inputs):
    query, key, value, mask = inputs['query'], inputs['key'], inputs[
        'value'], inputs['mask']
    batch_size = tf.shape(query)[0]

    # 1. WQ, WK, WV에 해당하는 밀집층 지나기
    # q : (batch_size, query의 문장 길이, d_model)
    # k : (batch_size, key의 문장 길이, d_model)
    # v : (batch_size, value의 문장 길이, d_model)
    # 참고) 인코더(k, v)-디코더(q) 어텐션에서는 query 길이와 key, value의 길이는 다를 수 있다.
    query = self.query_dense(query)
    key = self.key_dense(key)
    value = self.value_dense(value)

    # 2. 헤드 나누기
    # q : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
    # k : (batch_size, num_heads, key의 문장 길이, d_model/num_heads)
    # v : (batch_size, num_heads, value의 문장 길이, d_model/num_heads)
    query = self.split_heads(query, batch_size)
    key = self.split_heads(key, batch_size)
    value = self.split_heads(value, batch_size)

    # 3. 스케일드 닷 프로덕트 어텐션. 앞서 구현한 함수 사용.
    # (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
    scaled_attention, _ = scaled_dot_product_attention(query, key, value, mask)
    # (batch_size, query의 문장 길이, num_heads, d_model/num_heads)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

    # 4. 헤드 연결(concatenate)하기
    # (batch_size, query의 문장 길이, d_model)
    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))

    # 5. WO에 해당하는 밀집층 지나기
    # (batch_size, query의 문장 길이, d_model)
    outputs = self.dense(concat_attention)

    return outputs


def create_padding_mask(x):
  mask = tf.cast(tf.math.equal(x, 0), tf.float32)
  # (batch_size, 1, 1, key의 문장 길이)
  return mask[:, tf.newaxis, tf.newaxis, :]


def encoder_layer(dff, d_model, num_heads, dropout, name="encoder_layer"):
  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")

  # 인코더는 패딩 마스크 사용
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  # 멀티-헤드 어텐션 (첫번째 서브층 / 셀프 어텐션)
  attention = MultiHeadAttention(
      d_model, num_heads, name="attention")({
          'query': inputs, 'key': inputs, 'value': inputs, # Q = K = V
          'mask': padding_mask # 패딩 마스크 사용
      })

  # 드롭아웃 + 잔차 연결과 층 정규화
  attention = tf.keras.layers.Dropout(rate=dropout)(attention)
  attention = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(inputs + attention)

  # 포지션 와이즈 피드 포워드 신경망 (두번째 서브층)
  outputs = tf.keras.layers.Dense(units=dff, activation='relu')(attention)
  outputs = tf.keras.layers.Dense(units=d_model)(outputs)

  # 드롭아웃 + 잔차 연결과 층 정규화
  outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
  outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention + outputs)

  return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)


def encoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name="encoder"):
  inputs = tf.keras.Input(shape=(None,), name="inputs")

  # 인코더는 패딩 마스크 사용
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  # 포지셔널 인코딩 + 드롭아웃
  embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

  # 인코더를 num_layers개 쌓기
  for i in range(num_layers):
    outputs = encoder_layer(dff=dff, d_model=d_model, num_heads=num_heads,
        dropout=dropout, name="encoder_layer_{}".format(i),
    )([outputs, padding_mask])

  return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)


def create_look_ahead_mask(x):
  seq_len = tf.shape(x)[1]
  look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  padding_mask = create_padding_mask(x) # 패딩 마스크도 포함
  return tf.maximum(look_ahead_mask, padding_mask)


def decoder_layer(dff, d_model, num_heads, dropout, name="decoder_layer"):
  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
  enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")

  # 디코더는 룩어헤드 마스크(첫번째 서브층)와 패딩 마스크(두번째 서브층) 둘 다 사용.
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name="look_ahead_mask")
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  # 멀티-헤드 어텐션 (첫번째 서브층 / 마스크드 셀프 어텐션)
  attention1 = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs, 'key': inputs, 'value': inputs, # Q = K = V
          'mask': look_ahead_mask # 룩어헤드 마스크
      })

  # 잔차 연결과 층 정규화
  attention1 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention1 + inputs)

  # 멀티-헤드 어텐션 (두번째 서브층 / 디코더-인코더 어텐션)
  attention2 = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': attention1, 'key': enc_outputs, 'value': enc_outputs, # Q != K = V
          'mask': padding_mask # 패딩 마스크
      })

  # 드롭아웃 + 잔차 연결과 층 정규화
  attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
  attention2 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention2 + attention1)

  # 포지션 와이즈 피드 포워드 신경망 (세번째 서브층)
  outputs = tf.keras.layers.Dense(units=dff, activation='relu')(attention2)
  outputs = tf.keras.layers.Dense(units=d_model)(outputs)

  # 드롭아웃 + 잔차 연결과 층 정규화
  outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
  outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(outputs + attention2)

  return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)


def decoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name='decoder'):
  inputs = tf.keras.Input(shape=(None,), name='inputs')
  enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')

  # 디코더는 룩어헤드 마스크(첫번째 서브층)와 패딩 마스크(두번째 서브층) 둘 다 사용.
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  # 포지셔널 인코딩 + 드롭아웃
  embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

  # 디코더를 num_layers개 쌓기
  for i in range(num_layers):
    outputs = decoder_layer(dff=dff, d_model=d_model, num_heads=num_heads,
        dropout=dropout, name='decoder_layer_{}'.format(i),
    )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

  return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)


def transformer(vocab_size, num_layers, dff,
                d_model, num_heads, dropout,
                name="transformer"):

  # 인코더의 입력
  inputs = tf.keras.Input(shape=(None,), name="inputs")

  # 디코더의 입력
  dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

  # 인코더의 패딩 마스크
  enc_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='enc_padding_mask')(inputs)

  # 디코더의 룩어헤드 마스크(첫번째 서브층)
  look_ahead_mask = tf.keras.layers.Lambda(
      create_look_ahead_mask, output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)

  # 디코더의 패딩 마스크(두번째 서브층)
  dec_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(inputs)

  # 인코더의 출력은 enc_outputs. 디코더로 전달
  enc_outputs = encoder(vocab_size=vocab_size, num_layers=num_layers, dff=dff,
      d_model=d_model, num_heads=num_heads, dropout=dropout,
  )(inputs=[inputs, enc_padding_mask]) # 인코더의 입력은 입력 문장과 패딩 마스크

  # 디코더의 출력은 dec_outputs. 출력층으로 전달
  dec_outputs = decoder(vocab_size=vocab_size, num_layers=num_layers, dff=dff,
      d_model=d_model, num_heads=num_heads, dropout=dropout,
  )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

  # 다음 단어 예측을 위한 출력층
  outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

  return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)


small_transformer = transformer(
    vocab_size = 9000,
    num_layers = 4,
    dff = 512,
    d_model = 128,
    num_heads = 4,
    dropout = 0.3,
    name="small_transformer")

tf.keras.utils.plot_model(
    small_transformer, to_file='small_transformer.png', show_shapes=True)

(1, 9000, 128)
(1, 9000, 128)


def loss_function(y_true, y_pred):
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

  loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

  mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
  loss = tf.multiply(loss, mask)

  return tf.reduce_mean(loss)


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Explicitly cast step to float32
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.minimum(arg1, arg2) * self.d_model**-0.5
    def get_config(self):
        config = {
            "d_model": self.d_model,
            "warmup_steps": self.warmup_steps,
        }
        return config


sample_learning_rate = CustomSchedule(d_model=128)

plt.plot(sample_learning_rate(tf.range(200000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

Text(0.5, 0, 'Train Step')


import pandas as pd
import urllib.request
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import re


train_data = pd.read_csv('dataset/ChatBotData.csv')
train_data.head()


print('챗봇 샘플의 개수 :', len(train_data))

챗봇 샘플의 개수 : 11823


questions = []
for sentence in train_data['Q']:
    # 구두점에 대해서 띄어쓰기
    # ex) 12시 땡! -> 12시 땡 !
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)


answers = []
for sentence in train_data['A']:
    # 구두점에 대해서 띄어쓰기
    # ex) 12시 땡! -> 12시 땡 !
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)


print(questions[:5])
print(answers[:5])

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']


# 서브워드텍스트인코더를 사용하여 질문과 답변을 모두 포함한 단어 집합(Vocabulary) 생성
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

# 시작 토큰과 종료 토큰에 대한 정수 부여.
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# 시작 토큰과 종료 토큰을 고려하여 단어 집합의 크기를 + 2
VOCAB_SIZE = tokenizer.vocab_size + 2


print('시작 토큰 번호 :',START_TOKEN)
print('종료 토큰 번호 :',END_TOKEN)
print('단어 집합의 크기 :',VOCAB_SIZE)

시작 토큰 번호 : [8178]
종료 토큰 번호 : [8179]
단어 집합의 크기 : 8180


# 서브워드텍스트인코더 토크나이저의 .encode()를 사용하여 텍스트 시퀀스를 정수 시퀀스로 변환.
print('Tokenized sample question: {}'.format(tokenizer.encode(questions[20])))

Tokenized sample question: [5766, 611, 3509, 141, 685, 3747, 849]


# 서브워드텍스트인코더 토크나이저의 .encode()와 decode() 테스트해보기

# 임의의 입력 문장을 sample_string에 저장
sample_string = questions[20]

# encode() : 텍스트 시퀀스 --> 정수 시퀀스
tokenized_string = tokenizer.encode(sample_string)
print ('정수 인코딩 후의 문장 {}'.format(tokenized_string))

# decode() : 정수 시퀀스 --> 텍스트 시퀀스
original_string = tokenizer.decode(tokenized_string)
print ('기존 문장: {}'.format(original_string))

정수 인코딩 후의 문장 [5766, 611, 3509, 141, 685, 3747, 849]
기존 문장: 가스비 비싼데 감기 걸리겠어


# 각 정수는 각 단어와 어떻게 mapping되는지 병렬로 출력
# 서브워드텍스트인코더는 의미있는 단위의 서브워드로 토크나이징한다. 띄어쓰기 단위 X 형태소 분석 단위 X
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))

5766 ----> 가스
611 ----> 비 
3509 ----> 비싼
141 ----> 데 
685 ----> 감기 
3747 ----> 걸리
849 ----> 겠어


# 최대 길이를 40으로 정의
MAX_LENGTH = 40

# 토큰화 / 정수 인코딩 / 시작 토큰과 종료 토큰 추가 / 패딩
def tokenize_and_filter(inputs, outputs):
  tokenized_inputs, tokenized_outputs = [], []

  for (sentence1, sentence2) in zip(inputs, outputs):
    # encode(토큰화 + 정수 인코딩), 시작 토큰과 종료 토큰 추가
    sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
    sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

    tokenized_inputs.append(sentence1)
    tokenized_outputs.append(sentence2)

  # 패딩
  tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
  tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_outputs, maxlen=MAX_LENGTH, padding='post')

  return tokenized_inputs, tokenized_outputs


questions, answers = tokenize_and_filter(questions, answers)


print('질문 데이터의 크기(shape) :', questions.shape)
print('답변 데이터의 크기(shape) :', answers.shape)

질문 데이터의 크기(shape) : (11823, 40)
답변 데이터의 크기(shape) : (11823, 40)


idx = 9
print(questions[idx])
print(answers[idx])

[8178 2161  782 7612  204   84  183  352 1258 8179    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[8178  350 3985   16   32    1 8179    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


print('단어 집합의 크기(Vocab size): {}'.format(VOCAB_SIZE))
print('전체 샘플의 수(Number of samples): {}'.format(len(questions)))

단어 집합의 크기(Vocab size): 8180
전체 샘플의 수(Number of samples): 11823


# 텐서플로우 dataset을 이용하여 셔플(shuffle)을 수행하되, 배치 크기로 데이터를 묶는다.
# 또한 이 과정에서 교사 강요(teacher forcing)을 사용하기 위해서 디코더의 입력과 실제값 시퀀스를 구성한다.
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 디코더의 실제값 시퀀스에서는 시작 토큰을 제거해야 한다.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1] # 디코더의 입력. 마지막 패딩 토큰이 제거된다.
    },
    {
        'outputs': answers[:, 1:]  # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다.
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)


# 임의의 샘플에 대해서 [:, :-1]과 [:, 1:]이 어떤 의미를 가지는지 테스트해본다.
print(answers[0]) # 기존 샘플
print(answers[:1][:, :-1]) # 마지막 패딩 토큰 제거하면서 길이가 39가 된다.
print(answers[:1][:, 1:]) # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다. 길이는 역시 39가 된다.

[8178 3844   74 7894    1 8179    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[[8178 3844   74 7894    1 8179    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[3844   74 7894    1 8179    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]


tf.keras.backend.clear_session()

# Hyper-parameters
NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

(1, 8180, 256)
(1, 8180, 256)


MAX_LENGTH = 40

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
  # ensure labels have shape (batch_size, MAX_LENGTH - 1)
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
  return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])


EPOCHS = 50

model.fit(dataset, epochs=EPOCHS)

Epoch 1/50
185/185 [==============================] - 45s 121ms/step - loss: 1.4468 - accuracy: 0.0261
Epoch 2/50
185/185 [==============================] - 14s 75ms/step - loss: 1.1741 - accuracy: 0.0494
Epoch 3/50
185/185 [==============================] - 12s 65ms/step - loss: 1.0053 - accuracy: 0.0506
Epoch 4/50
185/185 [==============================] - 12s 65ms/step - loss: 0.9293 - accuracy: 0.0543
Epoch 5/50
185/185 [==============================] - 12s 63ms/step - loss: 0.8713 - accuracy: 0.0574
Epoch 6/50
185/185 [==============================] - 12s 64ms/step - loss: 0.8116 - accuracy: 0.0618
Epoch 7/50
185/185 [==============================] - 11s 62ms/step - loss: 0.7450 - accuracy: 0.0681
Epoch 8/50
185/185 [==============================] - 11s 60ms/step - loss: 0.6709 - accuracy: 0.0759
Epoch 9/50
185/185 [==============================] - 11s 61ms/step - loss: 0.5922 - accuracy: 0.0845
Epoch 10/50
185/185 [==============================] - 11s 62ms/step - loss: 0.5098 - accuracy: 0.0940
Epoch 11/50
185/185 [==============================] - 12s 63ms/step - loss: 0.4269 - accuracy: 0.1039
Epoch 12/50
185/185 [==============================] - 11s 61ms/step - loss: 0.3459 - accuracy: 0.1152
Epoch 13/50
185/185 [==============================] - 11s 61ms/step - loss: 0.2710 - accuracy: 0.1259
Epoch 14/50
185/185 [==============================] - 12s 62ms/step - loss: 0.2056 - accuracy: 0.1363
Epoch 15/50
185/185 [==============================] - 11s 61ms/step - loss: 0.1513 - accuracy: 0.1454
Epoch 16/50
185/185 [==============================] - 11s 61ms/step - loss: 0.1096 - accuracy: 0.1529
Epoch 17/50
185/185 [==============================] - 12s 64ms/step - loss: 0.0796 - accuracy: 0.1587
Epoch 18/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0618 - accuracy: 0.1620
Epoch 19/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0515 - accuracy: 0.1635
Epoch 20/50
185/185 [==============================] - 12s 62ms/step - loss: 0.0453 - accuracy: 0.1646
Epoch 21/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0428 - accuracy: 0.1649
Epoch 22/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0410 - accuracy: 0.1651
Epoch 23/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0369 - accuracy: 0.1660
Epoch 24/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0323 - accuracy: 0.1671
Epoch 25/50
185/185 [==============================] - 11s 62ms/step - loss: 0.0287 - accuracy: 0.1680
Epoch 26/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0249 - accuracy: 0.1688
Epoch 27/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0229 - accuracy: 0.1694
Epoch 28/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0201 - accuracy: 0.1701
Epoch 29/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0184 - accuracy: 0.1706
Epoch 30/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0167 - accuracy: 0.1709
Epoch 31/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0155 - accuracy: 0.1713
Epoch 32/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0143 - accuracy: 0.1716
Epoch 33/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0129 - accuracy: 0.1719
Epoch 34/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0126 - accuracy: 0.1719
Epoch 35/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0110 - accuracy: 0.1725
Epoch 36/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0114 - accuracy: 0.1723
Epoch 37/50
185/185 [==============================] - 12s 63ms/step - loss: 0.0099 - accuracy: 0.1727
Epoch 38/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0098 - accuracy: 0.1727
Epoch 39/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0093 - accuracy: 0.1729
Epoch 40/50
185/185 [==============================] - 12s 63ms/step - loss: 0.0089 - accuracy: 0.1729
Epoch 41/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0081 - accuracy: 0.1732
Epoch 42/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0079 - accuracy: 0.1731
Epoch 43/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0078 - accuracy: 0.1732
Epoch 44/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0070 - accuracy: 0.1734
Epoch 45/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0069 - accuracy: 0.1735
Epoch 46/50
185/185 [==============================] - 11s 61ms/step - loss: 0.0069 - accuracy: 0.1734
Epoch 47/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0064 - accuracy: 0.1736
Epoch 48/50
185/185 [==============================] - 11s 60ms/step - loss: 0.0061 - accuracy: 0.1736
Epoch 49/50
185/185 [==============================] - 11s 58ms/step - loss: 0.0060 - accuracy: 0.1736
Epoch 50/50
185/185 [==============================] - 11s 59ms/step - loss: 0.0056 - accuracy: 0.1737

<keras.src.callbacks.History at 0x7bf9ebd1a110>


model.save_weights('tansformer_weight')


check_model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

check_model.load_weights('tansformer_weight')

(1, 8180, 256)
(1, 8180, 256)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7bf9eaef4b80>


def preprocess_sentence(sentence):
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = sentence.strip()
  return sentence

def evaluate(sentence):
  sentence = preprocess_sentence(sentence)

  sentence = tf.expand_dims(
      START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)

  output = tf.expand_dims(START_TOKEN, 0)

  # 디코더의 예측 시작
  for i in range(MAX_LENGTH):
    predictions = model(inputs=[sentence, output], training=False)

    # 현재(마지막) 시점의 예측 단어를 받아온다.
    predictions = predictions[:, -1:, :]
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

    # 만약 마지막 시점의 예측 단어가 종료 토큰이라면 예측을 중단
    if tf.equal(predicted_id, END_TOKEN[0]):
      break

    # 마지막 시점의 예측 단어를 출력에 연결한다.
    # 이는 for문을 통해서 디코더의 입력으로 사용될 예정이다.
    output = tf.concat([output, predicted_id], axis=-1)

  return tf.squeeze(output, axis=0)


def predict(sentence):
  prediction = evaluate(sentence)

  predicted_sentence = tokenizer.decode(
      [i for i in prediction if i < tokenizer.vocab_size])

  print('Input: {}'.format(sentence))
  print('Output: {}'.format(predicted_sentence))

  return predicted_sentence


output = predict("사랑해")

Input: 사랑해
Output: 하늘 만큼 땅 만큼 사랑해요 .


output = predict("밥 먹었어?")

Input: 밥 먹었어?
Output: 저는 배터리가 밥이예요 .


output = predict("나랑 쇼핑하러 가자")

Input: 나랑 쇼핑하러 가자
Output: 좋아요 !


output = predict("바다로 여행가고 싶어")

Input: 바다로 여행가고 싶어
Output: 여행 좋죠 !

	Self Attention	기존의 Attention
Q = Query	입력 문장의 모든 Word Vector	t 시점의 디코더 셀에서의 은닉 상태
K = Keys	입력 문장의 모든 Word Vector	모든 시점의 인코더 셀의 은닉 상태들
V = Values	입력 문장의 모든 Word Vector	모든 시점의 인코더 셀의 은닉 상태들

[딥러닝] 생성형 모델 : VAE / GAN / Diffusion (+ 이미지 생성 및 복원해보기) (0)	2024.06.17
[딥러닝] 기계 번역 : Seq2Seq와 Attention (+ 모델 학습시켜 다국어 번역해보기) (0)	2024.06.11
[딥러닝] NLP : 자연어 처리 기본 (+ 영화 리뷰글 긍정/부정 판단해보기) (0)	2024.06.09
[딥러닝] 기억하는 신경망 : RNN, 그리고 개선 모델 (LSTM, GRU) (0)	2024.06.08
[딥러닝] CNN : ResNet 모델로 동물 이미지 분류하기(CIFAR 이미지셋) (0)	2024.06.08

	Q	A
0	12시 땡!	하루가 또 가네요.
1	1지망 학교 떨어졌어	위로해 드립니다.
2	3박4일 놀러가고 싶다	여행은 언제나 좋죠.
3	3박4일 정도 놀러가고 싶다	여행은 언제나 좋죠.
4	PPL 심하네	눈살이 찌푸려지죠.

데굴데굴 개발자의 기록

CATEGORIES

[딥러닝] Transformer : 소개와 동작 원리 (+ 간단한 챗봇 만들기) /category/Data%20Science/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D%20%26%20%EB%94%A5%EB%9F%AC%EB%8B%9D

Seq2Seq (Sequence-to-Sequence)

RNN(LSTM/GRU) 기반 모델의 한계

Attention

아이디어

Transformer

Self Attention

Self Attention의 과정

1. Query, Key, Value(Q, K, V) 벡터 얻기

2. Attention Value 계산하기

Transformer의 메커니즘

Encoder : Input에 대한 처리

Positional Encoding

Encoder : Self Attention Layer

Encoder : 출력

Decoder : Input에 대한 처리

Decoder : Self Attention Layer

Decoder : Encoder-Decoder Attention Layer

Decoder : 출력

Transformer 정리

Attention

여러 개의 인코더와 디코더

Transformer 모델의 확장

BERT (2018)(Pretraining of Deep Bidirectional Transformers for Language Understanding)

GPT(Generative Pre-trained Transformer)

간단한 챗봇 만들기 : Transformer 모델 구성 및 학습

Positional Encoding 준비¶

MultiHeadAttention 준비¶

Encoder Layer 정의¶

1. 입력¶

2. Self-Attention (첫 번째 Sub layer)¶

3. Feed-forwarding (두 번째 Sub layer)¶

4. 드롭아웃 및 잔차 연결 및 층 정규화¶

5. 출력¶

Encoder¶

1. 입력 처리¶

2. 여러 개의 Encoder 쌓기¶

3. 인코더의 최종 출력 반환¶

Decoder Layer 정의¶

1. 입력¶

2. Masked self-attention (첫 번째 Sub layer):¶

3. 잔차 연결과 정규화¶

4. Decoder-Encoder Attention (두 번째 Sub layer):¶

5. Feed-fowarding (세 번째 Sub layer):¶

6. 출력:¶

Decoder¶

1. 입력 처리¶

2. 여러 개의 Decoder 쌓기¶

3. 디코더의 최종 출력 반환¶

Transformer 아키텍처 정의¶

1. 인코더와 디코더의 입력¶

2. 인코더와 디코더 모델 호출¶

3. 출력층¶

모델 아키텍쳐 보기¶

Loss Function¶

Transformer의 학습률 스케줄링 정의¶

1. 데이터 로드¶

2. 데이터 전처리 및 토큰화, 임베딩 벡터로 변환¶

서브워드 텍스트 인코더¶

임베딩 벡터 출력¶

3. 모델 학습¶

1. 하이퍼 파라미터 설정¶

2. 학습률 스케줄링 설정¶

3. 모델 설정¶

4. 결과 확인¶

'Data Science > 머신러닝 & 딥러닝' 카테고리의 다른 글

BELATED ARTICLES

NOTICE

ARCHIVE

RECENTPOST

RECENTCOMMENT

티스토리툴바

[딥러닝] Transformer : 소개와 동작 원리 (+ 간단한 챗봇 만들기)
/category/Data%20Science/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D%20%26%20%EB%94%A5%EB%9F%AC%EB%8B%9D

2. Attention Value 계산하기