Yuchan committed on
Commit 7f390c3 · verified · 1 Parent(s): b6c9959

Update Inference.py

Files changed (1):
  1. Inference.py +292 -171

Inference.py CHANGED
@@ -1,233 +1,354 @@
- import json
- import numpy as np
- import pandas as pd
- import tensorflow as tf
- from tensorflow.keras import layers
- import sentencepiece as spm
  import requests

- # ⬇️ Load the tokenizer
- sp = spm.SentencePieceProcessor()
- sp.load("ko_unigram.model")

- # ⬇️ Extract the special token IDs
- pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
- start_id = sp.piece_to_id("<start>")
- sep_id = sp.piece_to_id("<sep>")
- end_id = sp.piece_to_id("<end>")
- unk_id = sp.piece_to_id("<unk>")

  vocab_size = sp.get_piece_size()
  print(f"✅ Vocabulary size: {vocab_size}")

- # ⬇️ Text <-> ID conversion functions
  def text_to_ids(text):
      return sp.encode(text, out_type=int)

  def ids_to_text(ids):
      return sp.decode(ids)

- max_len = 230
- batch_size = 128

- class Lo(layers.Layer):
-     def __init__(self, d_model):
          super().__init__()
-         # keep internal computation in float32
-         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
-         self.p = layers.Dense(96, use_bias=True, dtype='float32')
-         self._out_dtype = 'float32'
-
      def call(self, x):
-         # x may be bfloat16; cast to float32 for stable intermediate computation
-         x_f32 = tf.cast(x, tf.float32)
-         x = self.proj(x_f32)
-         x = tf.nn.gelu(x)
-         x = self.p(x)
-         # cast back to model dtype for consistency
-         return tf.cast(x, self._out_dtype)
-
- class LoSoU(layers.Layer):
-     """
-     Stabilized LoSoU layer (dynamic alpha)
-     - alpha is computed dynamically from the input: alpha = sigmoid(Linear(x))
-     - uses an exponential moving average (EMA) instead of a cumulative sum (alpha: smoothing factor)
-     - internal computation runs in float32 (better bfloat16 stability on TPU)
-     - clips the EMA output and applies a small epsilon
-     - safe split handling (assumes an even last dimension; otherwise it must be padded)
-     """
      def __init__(self, d_model, clip_value=5.0, eps=1e-6):
          super().__init__()
-         # perform most of the computation in float32
          self.d_model = d_model
          self.clip_value = float(clip_value)
          self.eps = float(eps)
-
-         # projection / gating layers in float32
-         self.Q = layers.Dense(96, dtype='float32')
-         self.K = layers.Dense(96, dtype='float32')
-         self.V = layers.Dense(96, activation='gelu', dtype='float32')
-         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
          self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

-         # layer that computes the dynamic alpha
-         # alpha must lie in [0, 1], so a sigmoid is used
-         # the input's d_model dimension is used to compute an alpha per sample
-         # e.g. (B, L, d_model) -> (B, L, 1) with sigmoid
-         # or (B, L, d_model) -> (B, L, d_model) -> global reduce -> (B, L, 1)
-         # either use one alpha for every position (based on the input mean)
-         # or a different alpha per position (computed for each position)
-         # here alpha is computed per position: (B, L, 1)
-         self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
-
-     def _ema_over_time(self, score, alpha_dynamic):
-         # score: (B, L, D) float32, roughly in [0, 1]
-         # alpha_dynamic: (B, L, 1) float32 in [0, 1]
-
-         # transpose to (L, B, D) to scan over time steps
-         seq = tf.transpose(score, perm=[1, 0, 2])  # (L, B, D)
-         alpha_seq = tf.transpose(alpha_dynamic, perm=[1, 0, 2])  # (L, B, 1)
-
-         def step(prev_ema, inputs):
-             x_t, alpha_t = inputs
-             # prev_ema: (B, D), x_t: (B, D), alpha_t: (B, 1)
-             new = alpha_t * x_t + (1.0 - alpha_t) * prev_ema
-             return new
-
-         # initialize the EMA with the first timestep's value
-         init = seq[0]  # (B, D)
-         first_alpha = alpha_seq[0]  # (B, 1)
-
-         # the scan elems must have shapes (L-1, B, D) and (L-1, B, 1)
-         remaining_seq = seq[1:]  # (L-1, B, D)
-         remaining_alpha = alpha_seq[1:]  # (L-1, B, 1)
-
-         # elems is a tuple of two tensors: (x_t, alpha_t)
-         elems = (remaining_seq, remaining_alpha)
-
-         ema_seq = tf.scan(fn=step, elems=elems, initializer=init)
-         # prepend the initial value
-         ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)  # (L, B, D)
-
-         # transpose back to (B, L, D)
-         ema = tf.transpose(ema_seq, perm=[1, 0, 2])
-         return ema
-
-     def call(self, x):
-         # x: (B, L, d_model), may be bfloat16 or float32
-         # cast to float32 for all internal computations
          x_f32 = tf.cast(x, tf.float32)
          residual = x_f32

-         # Q, K, V
-         q = self.Q(x_f32)  # (B, L, 96)
-         k = self.K(x_f32)  # (B, L, 96)
-         V = tf.cast(self.V(x), tf.float32)  # ensure V's output is float32
-
-         # gating signals in (0, 1)
-         g_q = tf.nn.sigmoid(q)
-         g_k = tf.nn.tanh(k)
-
-         # elementwise product -> bounded roughly [0, 1]
          score = g_q * g_k

-         # dynamic alpha: (B, L, d_model) -> (B, L, 1)
-         alpha_dynamic = self.alpha_linear(x_f32) * 0.8 + 0.1  # (B, L, 1)
-         # alpha_dynamic can be post-processed if needed (e.g. min/max clipping)
-         # ex: alpha_dynamic = tf.clip_by_value(alpha_dynamic, 0.01, 0.99)
-
-         # EMA across time (stable alternative to cumsum)
-         score_ema = self._ema_over_time(score, alpha_dynamic)
-
-         # optionally normalize by (mean + eps) across the last dim to reduce scale variations
-         mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)  # (B, L, 1)
-         denom = tf.maximum(mean_last, self.eps)
-         score_norm = score_ema / denom

-         # clip to avoid extremes
          score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
-
-         # combine with V
-         x_comb = score_clipped * V  # (B, L, d_model)
-
-         out = self.proj(x_comb)  # (B, L, d_model)
-         out = self.norm(out)
-
-         # cast back to the original dtype for downstream layers
          return tf.cast(out, x.dtype)
- class Block(layers.Layer):
-     def __init__(self, d_model, hyper_n):
-         super().__init__()
-         self.losou = [LoSoU(d_model) for _ in range(hyper_n)]

-     def call(self, x):
-         for losou in self.losou:
-             x = losou(x)
-         return x
-
- class ReLaM(tf.keras.Model):
-     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
          super().__init__()
-         self.token_embedding = layers.Embedding(vocab_size, 128)
-         self.pos_embedding = layers.Embedding(max_seq_len, 128)
-         self.blocks = [Block(d_model, hyper_n=1) for _ in range(n_layers)]
-         self.proj = layers.Dense(128)
-         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
-
-     def call(self, x, training=False):
-         batch_size, seq_len = tf.shape(x)[0], tf.shape(x)[1]
-         positions = tf.range(seq_len)[tf.newaxis, :]
-         x = self.token_embedding(x) + self.pos_embedding(positions)
-         for block in self.blocks:
-             x = block(x)
-         x = self.proj(x)
-         x = self.ln_f(x)
-         embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
-         logits = tf.matmul(x, embedding_matrix, transpose_b=True)
-         return tf.cast(logits, tf.float32)
-
- # build the model
- model = ReLaM(
-     vocab_size=vocab_size,
-     max_seq_len=max_len,
-     d_model=256,
-     n_layers=1
- )
-
- dummy_input = tf.zeros((1, max_len), dtype=tf.int32)
- _ = model(dummy_input)
- model.load_weights('/content/Cobra.weights.h5')

  print("Model weights loaded!")

- def generate_text_topp(model, prompt, max_len=100, max_gen=98, p=0.9, temperature=0.8, min_len=30):
      model_input = text_to_ids(f"<start> {prompt} <sep>")
      model_input = model_input[:max_len]
      generated = list(model_input)

      for step in range(max_gen):
-         if len(generated) > max_len:
              input_seq = generated[-max_len:]
          else:
              input_seq = generated

          input_padded = np.pad(input_seq, (0, max_len - len(input_seq)), constant_values=pad_id)
          input_tensor = tf.convert_to_tensor([input_padded])
-         logits = model(input_tensor, training=False)

          next_token_logits = logits[0, len(input_seq) - 1].numpy()

          next_token_logits[end_id] -= 5.0
          next_token_logits[pad_id] -= 10.0

          probs = tf.nn.softmax(next_token_logits / temperature).numpy()
          sorted_indices = np.argsort(probs)[::-1]
          sorted_probs = probs[sorted_indices]

          cumulative_probs = np.cumsum(sorted_probs)
          cutoff = np.searchsorted(cumulative_probs, p)
          top_indices = sorted_indices[:cutoff + 1]
          top_probs = sorted_probs[:cutoff + 1]
          top_probs /= np.sum(top_probs)
          next_token_id = np.random.choice(top_indices, p=top_probs)

          if next_token_id == end_id and len(generated) >= min_len:
              break

          generated.append(int(next_token_id))
-     return ids_to_text(generated)

  print("\n\n===== Generation result =====")
- print(generate_text_topp(model, "제가 이따가 버스를 타야 해서 준비 좀 해야겠어요. 재미있는 대화였습니다!", p=0.8))

+ import tensorflow as tf
+ from tensorflow.keras import layers, Model
+ import numpy as np
+ import tensorflow.keras.backend as K
+ from tensorflow.keras import mixed_precision
+ import sentencepiece as spm
+ import os, json
  import requests

+ print('1')

+ tf.get_logger().setLevel("ERROR")
+ SEED = 42
+ tf.random.set_seed(SEED)
+ np.random.seed(SEED)
+ max_len = 150  # was set to 200 in the original code
+ batch_size = 128

+ # TPU initialization (same as the original code)
+ try:
+     resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
+     tf.tpu.experimental.initialize_tpu_system(resolver)
+     strategy = tf.distribute.TPUStrategy(resolver)
+     print("✅ TPU initialized:", resolver.cluster_spec().as_dict())
+     on_tpu = True
+
+ except Exception as e:
+     print("⚠️ No TPU available, continuing on GPU/CPU:", e)
+     strategy = tf.distribute.get_strategy()
+     on_tpu = False
+
+ # Mixed precision (same as the original code)
+ policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
+ mixed_precision.set_global_policy(policy)
+ print("✅ Mixed precision:", policy)
+
+ # =======================
+ # 1) File download and tokenizer initialization (same as the original code)
+ # =======================
+
+ def download_file(url, save_path):
+     r = requests.get(url, stream=True)
+     r.raise_for_status()
+     with open(save_path, "wb") as f:
+         for chunk in r.iter_content(8192*2):
+             f.write(chunk)
+     print(f"✅ {save_path} saved")
+
+ DATA_PATH = "converted.jsonl"
+ TOKENIZER_PATH = "ko_unigram.model"
+
+ if not os.path.exists(DATA_PATH):
+     download_file(
+         "https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/output.jsonl?download=true",
+         DATA_PATH
+     )
+
+ if not os.path.exists(TOKENIZER_PATH):
+     download_file(
+         "https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/ko_unigram.model?download=true",
+         TOKENIZER_PATH
+     )
+
+ sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
+
+ pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
+ start_id = sp.piece_to_id("<start>")
+ sep_id = sp.piece_to_id("<sep>")
+ end_id = sp.piece_to_id("<end>")
+ unk_id = sp.piece_to_id("<unk>")
  vocab_size = sp.get_piece_size()
  print(f"✅ Vocabulary size: {vocab_size}")

  def text_to_ids(text):
      return sp.encode(text, out_type=int)

  def ids_to_text(ids):
      return sp.decode(ids)

+ # =======================
+ # 3) Model layers (kept from the original code)
+ # =======================

+ class SwiGLU(layers.Layer):
+     def __init__(self, d_model, d_ff):
          super().__init__()
+         self.proj = layers.Dense(d_ff)
+         self.out = layers.Dense(d_model)

      def call(self, x):
+         x_proj = self.proj(x)
+         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
+         return self.out(x_val * tf.nn.silu(x_gate))
+
+ class gMLPBlock(layers.Layer):
+     def __init__(self, d_model, seq_len, dropout=0.1):
+         super().__init__()
+         self.d_model = d_model
+         self.seq_len = seq_len
+         self.norm = layers.LayerNormalization(epsilon=1e-6)
+
+         # FFN: channel expansion
+         # expand to d_model * 4
+         self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
+         self.dropout = layers.Dropout(dropout)
+
+         # Spatial Gating Unit (SGU)
+         self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
+         self.sgu_proj = layers.Dense(seq_len, use_bias=False)
+
+         # set the output dimension to d_model * 2 (the dimension of U)
+         self.sgu_final = layers.Dense(d_model * 2, use_bias=True)
+
+         self.out_proj = layers.Dense(d_model, use_bias=True)
+
+     def call(self, x, training=False):
+         # 1. Norm and channel expansion
+         residual = x
+         x_norm = self.norm(x)
+         x_proj = self.channel_proj(x_norm)  # shape: (B, L, 4*D)
+
+         # 2. Split into U and V streams
+         u, v = tf.split(x_proj, 2, axis=-1)  # u, v shape: (B, L, 2*D)
+
+         # 3. Spatial Gating Unit (SGU)
+         v_norm = self.sgu_norm(v)
+         v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])  # (B, 2D, L)
+
+         # 💡 token mixing happens here (Dense applied along the sequence axis)
+         v_proj = self.sgu_proj(v_norm_T)  # (B, 2D, L)
+         v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])  # (B, L, 2D)
+
+         # 4. Activation and gate generation
+         # standard gMLP applies GELU to U and uses V as a linear gate
+         # here GELU is applied to U
+         u_act = tf.nn.gelu(u)
+         v_gate = self.sgu_final(v_proj_T)  # shape: (B, L, 2*D)
+
+         # 5. Gating and contraction
+         z = u_act * v_gate  # gating
+         z = self.dropout(z, training=training)
+         out = self.out_proj(z)  # shape: (B, L, D)
+
+         # 6. Residual connection
+         return residual + out
+
+ class CrossBlock(layers.Layer):
+     def __init__(self, clip_value=5.0, eps=1e-6):  # 💡 a d_model argument was added in the original note
+         super().__init__()
+         self.clip_value = clip_value
+         self.eps = eps
+     # 💡 changed: the output dimension goes from 1 to d_model
+     def call(self, x, z):
+         # x, z shape: (Batch, Seq_len, D_model)
+         g_q = (tf.nn.tanh(x) + 1.0) / 2.0
+         g_k = (tf.nn.tanh(z) + 1.0) / 2.0
+         score = (g_q * g_k)
+         score = tf.cumsum(score, axis=1)
+
+         seq_len = tf.shape(score)[1]
+         # broadcast [1, 2, 3, ..., L] across the D_model dimension
+         count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
+         count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
+
+         # divide the cumulative sum by the token count so far to get the running mean (B, L, D)
+         score_mean = score / count_for_mean
+
+         # normalization denominator
+         denom = tf.maximum(score_mean, self.eps)
+         score_norm = score / denom
+         # -----------------------------------------------
+
+         score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
+         y = score_clipped * z
+         return y
+
+ class LoU(layers.Layer):
      def __init__(self, d_model, clip_value=5.0, eps=1e-6):
          super().__init__()
          self.d_model = d_model
          self.clip_value = float(clip_value)
          self.eps = float(eps)
+         self.Q = layers.Dense(d_model, dtype='float32')
+         self.K = layers.Dense(d_model, dtype='float32')
+         self.V = layers.Dense(d_model, dtype='float32')
          self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+         self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+
+         self.glu = SwiGLU(d_model, 320)
+         self.cross = CrossBlock()

+     def call(self, x, z):
          x_f32 = tf.cast(x, tf.float32)
          residual = x_f32
+         x_f32 = self.norm1(x)

+         q = self.Q(x_f32)
+         k = self.K(x_f32)
+         V = self.V(x_f32)
+         g_q = (tf.nn.tanh(q) + 1.0) / 2.0
+         g_k = (tf.nn.tanh(k) + 1.0) / 2.0
          score = g_q * g_k

+         score = tf.cumsum(score, axis=1)  # (B, L, D)
+
+         # 💡 changed: normalize by the running mean of the cumulative sum up to the current token
+         seq_len = tf.shape(score)[1]
+         # broadcast [1, 2, 3, ..., L] across the D_model dimension
+         count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
+         count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
+
+         # divide the cumulative sum by the token count so far to get the running mean (B, L, D)
+         score_mean = score / count_for_mean
+
+         # normalization denominator
+         denom = tf.maximum(score_mean, self.eps)
+         score_norm = score / denom
+         # -----------------------------------------------

          score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
+         x_comb = score_clipped * V
+
+         out = self.norm(x_comb + residual)
+         out = self.cross(out, z)
+         out = self.glu(out)

          return tf.cast(out, x.dtype)

+
+ # =======================
+ # 4) AlphaS2S model (kept from the original code)
+ # =======================

+ class AlphaS2S(tf.keras.Model):
+     def __init__(self, num_layers, d_model, num_heads, input_vocab_size, target_vocab_size, max_len=200, dropout=0.1):
          super().__init__()
+         self.max_len = max_len
+         self.d_model = d_model
+
+         # encoder/decoder token and positional embeddings all use max_len
+         self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
+         self.enc_pos_embedding = layers.Embedding(max_len, d_model)
+         self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
+         self.dec_pos_embedding = layers.Embedding(max_len, d_model)
+
+         # the encoder blocks and LoU keep the same structure as the original code
+         self.enc_layers = [gMLPBlock(d_model, seq_len=max_len) for _ in range(num_layers)]
+         self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
+
+         self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
+
+     def call(self, inputs, training=False):
+         # enc_inputs and dec_inputs are the same sequence (unified input)
+         enc_inputs = inputs["enc_inputs"]
+         dec_inputs = inputs["dec_inputs"]
+
+         enc_pos = tf.range(tf.shape(enc_inputs)[1])[tf.newaxis, :]
+         dec_pos = tf.range(tf.shape(dec_inputs)[1])[tf.newaxis, :]
+
+         # run the encoder
+         x = self.enc_embedding(enc_inputs) + self.enc_pos_embedding(enc_pos)
+         # Note: no mask -> bidirectional (BERT-like encoder)
+         for layer in self.enc_layers: x = layer(x, training=training)
+         enc_out = x  # final encoder output (the decoder's 'z' input)
+
+         # run the decoder
+         y = self.dec_embedding(dec_inputs) + self.dec_pos_embedding(dec_pos)
+         # Note: LoU accumulates gated scores internally and plays the role of a cross-attention block
+         for layer in self.dec_layers: y = layer(y, enc_out, training=training)
+
+         return self.final_layer(y)
+
+ # build the model and load the weights
+ chat_model = AlphaS2S(num_layers=4, d_model=160, num_heads=8,
+                       input_vocab_size=vocab_size, target_vocab_size=vocab_size, max_len=max_len)
+
+ dummy_input = {
+     "enc_inputs": tf.zeros((1, max_len), dtype=tf.int32),
+     "dec_inputs": tf.zeros((1, max_len), dtype=tf.int32)
+ }
+ _ = chat_model(dummy_input)
+
+ chat_model.load_weights('/kaggle/working/chat_model.weights.h5')
  print("Model weights loaded!")
+ # =======================
+ # 6) Inference function (kept from the original code)
+ # =======================

+ def generate_text_topp(model, prompt, max_len=150, max_gen=100, p=0.9, temperature=0.8, min_len=20):
+     # the encoder input uses only <start> prompt <sep>
      model_input = text_to_ids(f"<start> {prompt} <sep>")
      model_input = model_input[:max_len]
      generated = list(model_input)
+
      for step in range(max_gen):
+         current_len = len(generated)
+
+         # use the sequence generated so far as input
+         if current_len > max_len:
              input_seq = generated[-max_len:]
          else:
              input_seq = generated
+
+         # pad to max_len
          input_padded = np.pad(input_seq, (0, max_len - len(input_seq)), constant_values=pad_id)
          input_tensor = tf.convert_to_tensor([input_padded])
+
+         # model inference (enc_inputs and dec_inputs use the same sequence)
+         dummy_input = {
+             "enc_inputs": input_tensor,
+             "dec_inputs": input_tensor
+         }
+         logits = model(dummy_input, training=False)
+
+         # take the next-token logits from the last real token position (0-based index: current_len - 1);
+         # after padding, the effective sequence length of input_tensor is len(input_seq)
          next_token_logits = logits[0, len(input_seq) - 1].numpy()
+
+         # discourage special tokens
          next_token_logits[end_id] -= 5.0
          next_token_logits[pad_id] -= 10.0
+
          probs = tf.nn.softmax(next_token_logits / temperature).numpy()
          sorted_indices = np.argsort(probs)[::-1]
          sorted_probs = probs[sorted_indices]
+
+         # top-p (nucleus) sampling
          cumulative_probs = np.cumsum(sorted_probs)
          cutoff = np.searchsorted(cumulative_probs, p)
          top_indices = sorted_indices[:cutoff + 1]
          top_probs = sorted_probs[:cutoff + 1]
          top_probs /= np.sum(top_probs)
          next_token_id = np.random.choice(top_indices, p=top_probs)
+
          if next_token_id == end_id and len(generated) >= min_len:
              break
+
          generated.append(int(next_token_id))
+
+     # strip the <start> token and everything up to <sep>
+     try:
+         sep_index = generated.index(sep_id)
+         # return only the response between <sep> and <end>
+         result_ids = generated[sep_index + 1:]
+         try:
+             end_index = result_ids.index(end_id)
+             result_ids = result_ids[:end_index]
+         except ValueError:
+             pass
+         return ids_to_text(result_ids)
+     except ValueError:
+         return ids_to_text(generated)  # return everything if <sep> is missing

  print("\n\n===== ์ƒ์„ฑ ๊ฒฐ๊ณผ =====")
353
+ # ๋ชจ๋ธ์ด 1 epoch๋งŒ ํ•™์Šต๋˜์—ˆ์œผ๋ฏ€๋กœ ์˜๋ฏธ ์žˆ๋Š” ๊ฒฐ๊ณผ๊ฐ€ ์•„๋‹ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
354
+ print(generate_text_topp(chat_model, "์ œ๊ฐ€ ์ด๋”ฐ๊ฐ€ ๋ฒ„์Šค๋ฅผ ํƒ€์•ผ ํ•ด์„œ ์ค€๋น„ ์ข€ ํ•ด์•ผ๊ฒ ์–ด์š”. ์žฌ๋ฏธ์žˆ๋Š” ๋Œ€ํ™”์˜€์Šต๋‹ˆ๋‹ค!", p=0.9))
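
The running-mean normalization that both CrossBlock and LoU apply to their gated scores can be sanity-checked outside the model. Below is a minimal standalone sketch (not part of the commit); the tensor values are made up for illustration, and the 5.0 bound mirrors the layers' default clip_value.

import tensorflow as tf

score = tf.constant([[[0.2], [0.4], [0.9]]])           # assumed gated scores, shape (B=1, L=3, D=1)
cum = tf.cumsum(score, axis=1)                         # running sum along the sequence: [0.2, 0.6, 1.5]
count = tf.reshape(tf.range(1, 4, dtype=tf.float32), (1, 3, 1))
running_mean = cum / count                             # [0.2, 0.3, 0.5]
normalized = cum / tf.maximum(running_mean, 1e-6)      # [1.0, 2.0, 3.0]
clipped = tf.clip_by_value(normalized, -5.0, 5.0)      # same bound as clip_value=5.0 in the layers
print(clipped.numpy())

Because the running mean is computed element-wise, cum / max(cum / count, eps) reduces to the position count whenever the accumulated score exceeds eps times that count, so after clipping the score saturates at 5.0 from the fifth position onward; it may be worth double-checking that this is the intended behaviour.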