OpenLab-NLP committed on
Commit 67804dc · verified · 1 Parent(s): 870a9a7

Update Test.py

Files changed (1)
  1. Test.py +223 -348
Test.py CHANGED
@@ -1,362 +1,237 @@
- !pip install sentencepiece
  import sentencepiece as spm
-
- # Imports
- import os, json, numpy as np, tensorflow as tf
  import requests
- print('1')
-
- tf.get_logger().setLevel("ERROR")
- SEED = 42
- tf.random.set_seed(SEED)
- np.random.seed(SEED)
-
- # TPU initialization
- try:
-     resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
-     tf.tpu.experimental.initialize_tpu_system(resolver)
-     strategy = tf.distribute.TPUStrategy(resolver)
-     print("✅ TPU initialized:", resolver.cluster_spec().as_dict())
-     on_tpu = True
- except Exception as e:
-     print("⚠️ No TPU available, falling back to GPU/CPU:", e)
-     strategy = tf.distribute.get_strategy()
-     on_tpu = False
-
- # Mixed precision
- from tensorflow.keras import mixed_precision
- import tensorflow as tf
- from tensorflow.keras import layers, activations, initializers
- policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
- mixed_precision.set_global_policy(policy)
- print("✅ Mixed precision:", policy)
-
- # =======================
- # 1) File downloads
- # =======================
- def download_file(url, save_path):
-     r = requests.get(url, stream=True)
-     r.raise_for_status()
-     with open(save_path, "wb") as f:
-         for chunk in r.iter_content(8192):
-             f.write(chunk)
-     print(f"✅ {save_path} saved")
-
- DATA_PATH = "converted.jsonl"
- TOKENIZER_PATH = "ko_unigram.model"
-
- if not os.path.exists(DATA_PATH):
-     download_file(
-         "https://huggingface.co/datasets/Yuchan5386/SFT/resolve/main/data_shuffled_1.jsonl?download=true",
-         DATA_PATH
-     )
-
- if not os.path.exists(TOKENIZER_PATH):
-     download_file(
-         "https://huggingface.co/Yuchan5386/inlam-70m-instruct/resolve/main/unigram.model?download=true",
-         TOKENIZER_PATH
-     )

- sp = spm.SentencePieceProcessor(TOKENIZER_PATH)

  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
- start_id = sp.piece_to_id("<start>")
- sep_id = sp.piece_to_id("<sep>")
- end_id = sp.piece_to_id("<end>")
- unk_id = sp.piece_to_id("<unk>")
  vocab_size = sp.get_piece_size()
- print(f"✅ Vocabulary size: {vocab_size}")
-
- max_len = 1024
- batch_size = 128
-
- def text_to_ids(text):
-     return sp.encode(text, out_type=int)
-
- def ids_to_text(ids):
-     return sp.decode(ids)
-
- def jsonl_stream(file_path):
-     with open(file_path, "r", encoding="utf-8") as f:
-         for line in f:
-             data = json.loads(line)
-             conversations = data.get("conversations", [])
-             for i in range(0, len(conversations) - 1, 2):
-                 human_msg = conversations[i]
-                 gpt_msg = conversations[i + 1]
-                 if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
-                     continue
-                 prompt = human_msg.get("value", "").strip()
-                 response = gpt_msg.get("value", "").strip()
-                 full = f"<start> {prompt} <sep> {response} <end>"
-                 if "<sep>" not in full:
-                     continue
-                 sep_index = full.index("<sep>")
-                 input_text = full[:sep_index + len("<sep>")].strip()
-                 target_text = full[sep_index + len("<sep>"):].strip()
-
-                 # prompt tokens are masked out of the loss; only response tokens are supervised
-                 input_ids = text_to_ids(input_text)
-                 target_ids = text_to_ids(target_text + " <end>")
-
-                 available_len = max_len - len(input_ids)
-                 if available_len <= 0:
-                     input_ids = input_ids[-max_len:]
-                     target_ids = []
-                     target_mask = [0] * len(input_ids)
-                 else:
-                     target_ids = target_ids[:available_len]
-                     target_mask = [0] * len(input_ids) + [1] * len(target_ids)
-
-                 full_input = input_ids + target_ids
-                 pad_len = max_len - len(full_input)
-                 full_input += [pad_id] * pad_len
-                 target_mask += [0] * pad_len
-
-                 # next-token targets: shift the inputs left by one position
-                 target_seq = full_input[1:] + [end_id]
-                 target_seq = target_seq[:max_len]
-
-                 masked_target = [
-                     t if m == 1 else pad_id
-                     for t, m in zip(target_seq, target_mask)
-                 ]
-
-                 yield (
-                     tf.convert_to_tensor(full_input, dtype=tf.int32),
-                     tf.convert_to_tensor(masked_target, dtype=tf.int32)
-                 )
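A worked example of the masking scheme above (hand-constructed, assuming a 3-token prompt p1 p2 p3, a 2-token response r1 r2, max_len = 8, pad_id = 0):

    full_input    = [p1, p2, p3, r1, r2,  0,  0,  0]
    target_mask   = [ 0,  0,  0,  1,  1,  0,  0,  0]
    target_seq    = [p2, p3, r1, r2,  0,  0,  0, end]
    masked_target = [ 0,  0,  0, r2,  0,  0,  0,  0]

Since position i is trained to predict token i + 1 while the mask is aligned to input positions, the label r1 (at target_seq index 2) is zeroed out and the last masked-in position is supervised to predict the padding after the response.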
 
  dataset = tf.data.Dataset.from_generator(
-     lambda: jsonl_stream(DATA_PATH),
-     output_signature=(
-         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
-         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
-     ),
- )
- dataset = dataset.shuffle(1000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
-
- with strategy.scope():
-     dist_dataset = strategy.experimental_distribute_dataset(dataset)
-
- class RotaryPositionalEmbedding(tf.keras.layers.Layer):
-     def __init__(self, dim):
-         super().__init__()
-         inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
-         self.inv_freq = tf.constant(inv_freq, dtype=tf.float32)

-     def call(self, x):
-         b, h, s, d = tf.unstack(tf.shape(x))
-         t = tf.range(s, dtype=tf.float32)
-         freqs = tf.einsum('i,j->ij', t, self.inv_freq)
-         dtype = x.dtype
-         emb_sin = tf.cast(tf.sin(freqs), dtype)
-         emb_cos = tf.cast(tf.cos(freqs), dtype)
-         emb_cos = tf.reshape(emb_cos, [1, 1, s, -1])
-         emb_sin = tf.reshape(emb_sin, [1, 1, s, -1])
-         x1, x2 = x[..., ::2], x[..., 1::2]
-         x_rot = tf.stack([x1*emb_cos - x2*emb_sin, x1*emb_sin + x2*emb_cos], axis=-1)
-         x_rot = tf.reshape(x_rot, tf.shape(x))
-         return x_rot
-
- class SwiGLU(tf.keras.layers.Layer):
-     def __init__(self, d_model, d_ff):
          super().__init__()
-         self.proj = tf.keras.layers.Dense(d_ff)
-         self.out = tf.keras.layers.Dense(d_model)

      def call(self, x):
-         x_proj = self.proj(x)
-         # half of the projection gates the other half (SwiGLU)
-         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
-         return self.out(x_val * tf.nn.silu(x_gate))
-
- class FlashAttentionMHA(layers.Layer):
-     def __init__(self, d_model, num_heads=8, dropout_rate=0.1):
-         super().__init__()
-         self.d_model = d_model
-         self.num_heads = num_heads
-         self.dh = d_model // num_heads
-
-         self.q_proj = layers.Dense(d_model, use_bias=False)
-         self.k_proj = layers.Dense(d_model, use_bias=False)
-         self.v_proj = layers.Dense(d_model, use_bias=False)
-         self.out_proj = layers.Dense(d_model, use_bias=False)
-         self.dropout = layers.Dropout(dropout_rate)
-         self.rope = RotaryPositionalEmbedding(self.dh)
-
-     @tf.function(jit_compile=True)
-     def call(self, x, training=False, causal=False):
-         B, N, D = tf.shape(x)[0], tf.shape(x)[1], x.shape[2]
-
-         # Q, K, V: (B, N, num_heads, dh)
-         Q = tf.reshape(self.q_proj(x), [B, N, self.num_heads, self.dh])
-         K = tf.reshape(self.k_proj(x), [B, N, self.num_heads, self.dh])
-         V = tf.reshape(self.v_proj(x), [B, N, self.num_heads, self.dh])
-
-         # transpose for attention: (B, num_heads, N, dh)
-         Q = tf.transpose(Q, [0, 2, 1, 3])
-         K = tf.transpose(K, [0, 2, 1, 3])
-         V = tf.transpose(V, [0, 2, 1, 3])
-
-         # apply RoPE
-         Q = self.rope(Q)
-         K = self.rope(K)
-
-         # scaled dot-product
-         scale = tf.cast(self.dh ** -0.5, x.dtype)
-         Q = Q * scale
-         attn_scores = tf.matmul(Q, K, transpose_b=True)
-
-         if causal:
-             mask = tf.linalg.band_part(tf.ones((N, N), dtype=x.dtype), -1, 0)
-             attn_scores = attn_scores * mask - 1e9 * (1 - mask)
-
-         attn_weights = tf.nn.softmax(attn_scores, axis=-1)
-         attn_weights = self.dropout(attn_weights, training=training)
-         out = tf.matmul(attn_weights, V)  # (B, h, N, dh)
-         out = tf.transpose(out, [0, 2, 1, 3])
-         out = tf.reshape(out, [B, N, D])
-         out = self.out_proj(out)
-         return out
-
-
- class GPTBlock(tf.keras.layers.Layer):
-     def __init__(self, d_model, d_ff, num_heads=12, dropout_rate=0.1, adapter_dim=64):
-         super().__init__()
-         self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-         self.mha = FlashAttentionMHA(d_model, num_heads, dropout_rate=dropout_rate)
-         self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
-         self.adapter_down = tf.keras.layers.Dense(adapter_dim, activation='gelu')
-         self.adapter_up = tf.keras.layers.Dense(d_model)
-         self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-         self.ffn = SwiGLU(d_model, d_ff)
-         self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
-
-     def call(self, x, training=False):
-         x_norm = self.ln1(x)
-         attn_out = self.mha(x_norm, training=training, causal=True)
-         attn_out = self.dropout1(attn_out, training=training)
-         adapter_out = self.adapter_up(self.adapter_down(attn_out))
-         attn_out = attn_out + adapter_out
-         x = x + attn_out
-         ffn_out = self.ffn(self.ln2(x))
-         x = x + self.dropout2(ffn_out, training=training)
-         return x
-
- class InLaM(tf.keras.Model):
-     def __init__(self, vocab_size, seq_len, d_model, d_ff, n_layers, num_heads=12, dropout_rate=0.1):
          super().__init__()
-         self.vocab_size = vocab_size
-         self.d_model = d_model
-
-         # embedding layer (bfloat16)
-         self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model, dtype="bfloat16")
-
-         # transformer blocks
-         self.blocks = [GPTBlock(d_model, d_ff, num_heads, dropout_rate) for _ in range(n_layers)]
-
-         # final LayerNorm
-         self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5, dtype="bfloat16")
-
-     def call(self, x, training=False):
-         # embedding
-         x = self.token_embedding(x)  # (batch, seq_len, d_model)
-         for block in self.blocks:
-             x = block(x, training=training)

-         x = self.ln_f(x)  # (batch, seq_len, d_model)
-         # weight tying: reuse the embedding matrix as the output projection
-         embed_weights = self.token_embedding.weights[0]  # (vocab_size, d_model)
-         logits = tf.matmul(x, embed_weights, transpose_b=True)  # (batch, seq_len, vocab_size)
-
-         # cast to float32 for numerical stability in the loss computation
-         return tf.cast(logits, tf.float32)
-
- # =======================
- # Loss / metric definitions
- # =======================
- def smoothed_loss_keras(y_true, y_pred, eps=0.1):
-     y_true = tf.cast(y_true, tf.int32)
-     mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
-     vocab = tf.shape(y_pred)[-1]
-     y_true_oh = tf.one_hot(y_true, depth=vocab, dtype=tf.float32)
-     y_true_ls = (1.0 - eps) * y_true_oh + eps / tf.cast(vocab, tf.float32)
-     log_probs = tf.nn.log_softmax(y_pred, axis=-1)
-     per_tok = -tf.reduce_sum(y_true_ls * log_probs, axis=-1)
-     per_tok = per_tok * mask
-     return tf.reduce_sum(per_tok) / (tf.reduce_sum(mask) + 1e-8)
-
- def masked_accuracy(y_true, y_pred):
-     y_true = tf.cast(y_true, tf.int32)
-     mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
-     pred_id = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
-     acc = tf.cast(tf.equal(y_true, pred_id), tf.float32) * mask
-     return tf.reduce_sum(acc) / (tf.reduce_sum(mask) + 1e-8)
-
- def masked_perplexity(y_true, y_pred, eps=0.1):
-     y_true = tf.cast(y_true, tf.int32)
-     mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
-     vocab = tf.shape(y_pred)[-1]
-     y_true_oh = tf.one_hot(y_true, depth=vocab, dtype=tf.float32)
-     y_true_ls = (1.0 - eps) * y_true_oh + eps / tf.cast(vocab, tf.float32)
-     log_probs = tf.nn.log_softmax(y_pred, axis=-1)
-     per_tok = -tf.reduce_sum(y_true_ls * log_probs, axis=-1)
-     per_tok = per_tok * mask
-     mean_loss = tf.reduce_sum(per_tok) / (tf.reduce_sum(mask) + 1e-8)
-     return tf.exp(mean_loss)
-
-
- # =======================
- # Model creation & compile
- # =======================
- with strategy.scope():
-     model = InLaM(vocab_size=vocab_size, seq_len=max_len, d_model=768, d_ff=768*4, n_layers=12)
-     dummy_input = tf.zeros((batch_size, max_len), dtype=tf.int32)
-     _ = model(dummy_input, training=False)
-     model.summary()
-
-     optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.9, beta_2=0.95, epsilon=1e-8, clipnorm=1.0)
-     model.compile(optimizer=optimizer, loss=smoothed_loss_keras, metrics=[masked_accuracy, masked_perplexity])
-
- # Training
- history = model.fit(dist_dataset, epochs=1, verbose=1)
-
- # =======================
- # Save weights
- # =======================
- model.save_weights("tf_model.weights.h5")
- print("✅ Model weights saved!")
-
- # =======================
- # Sample generation function
- # =======================
- def generate_text_topp(model, prompt, max_len=115, max_gen=98, p=0.9, temperature=0.68, min_len=20):
-     model_input = text_to_ids(f"<start> {prompt} <sep>")
-     model_input = model_input[:max_len]
-     generated = list(model_input)
-
-     for step in range(max_gen):
-         input_seq = generated[-max_len:] if len(generated) > max_len else generated
-         input_padded = np.pad(input_seq, (0, max_len - len(input_seq)), constant_values=pad_id)
-         input_tensor = tf.convert_to_tensor([input_padded], dtype=tf.int32)
-
-         logits = model(input_tensor, training=False).numpy()[0, len(input_seq) - 1]
-         # discourage early termination and padding
-         logits[end_id] -= 5.0
-         logits[pad_id] -= 10.0
-
-         probs = tf.nn.softmax(logits / temperature).numpy()
-         sorted_idx = np.argsort(probs)[::-1]
-         sorted_probs = probs[sorted_idx]
-         cumulative = np.cumsum(sorted_probs)
-         cutoff = np.searchsorted(cumulative, p)
-         top_idx = sorted_idx[:cutoff + 1]
-         top_probs = sorted_probs[:cutoff + 1] / sorted_probs[:cutoff + 1].sum()
-
-         next_token = int(np.random.choice(top_idx, p=top_probs))
-         if next_token == end_id and len(generated) >= min_len:
              break
-         generated.append(next_token)
-
-     return ids_to_text(generated)
-
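For concreteness, here is how the nucleus cutoff above behaves on a toy distribution (hand-computed, not from the source): with temperature-scaled probabilities [0.5, 0.3, 0.15, 0.05] and p = 0.9, the cumulative sums are [0.5, 0.8, 0.95, 1.0]; np.searchsorted(cumulative, 0.9) returns 2, so the top 3 tokens are kept and renormalized to roughly [0.526, 0.316, 0.158] before sampling.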
- # =======================
- # Test generation
- # =======================
- prompt = "안녕하세요! 한국 밴드에 대해 궁금한 것이 있어요!"
- sample_text = generate_text_topp(model, prompt, p=0.9)
- print("\n===== Generated sample =====\n")
- print(sample_text)
+ import os, json, random, numpy as np, tensorflow as tf
+ from tensorflow.keras import layers, Model
  import sentencepiece as spm
  import requests
+
+ # ===============================
+ # 0️⃣ Environment setup
+ # ===============================
+ TOKENIZER_PATH = "bpe.model"
+ DATA_PATH = "corpus.txt"  # 36M-sentence text file
+ MAX_LEN = 128
+ EMBED_DIM = 384
+ LATENT_DIM = 384
+ BATCH_SIZE = 400
+ NEGATIVE_RATIO = 1  # number of negative samples per positive

+ def download_file(url, save_path):
+     if not os.path.exists(save_path):
+         print(f"Downloading {save_path} ...")
+         r = requests.get(url, stream=True)
+         r.raise_for_status()
+         with open(save_path, "wb") as f:
+             for chunk in r.iter_content(8192 * 2):
+                 f.write(chunk)
+         print(f"✅ {save_path} saved")
+
+ download_file("https://huggingface.co/datasets/OpenLab-NLP/ko-corpus/resolve/main/bpe.model?download=true", TOKENIZER_PATH)
+ download_file("https://huggingface.co/datasets/OpenLab-NLP/ko-corpus/resolve/main/shuffled_corpus%20(1).txt?download=true", DATA_PATH)
+
+ # ===============================
+ # 2️⃣ Tokenizer setup
+ # ===============================
+ sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
  vocab_size = sp.get_piece_size()
+
+ def encode_sentence(sentence, max_len=MAX_LEN):
+     return sp.encode(sentence, out_type=int)[:max_len]
+
+ def pad_sentence(tokens):
+     return tokens + [pad_id] * (MAX_LEN - len(tokens))
+
+ def gen_pairs_streaming(txt_path=DATA_PATH, negative_ratio=NEGATIVE_RATIO):
+     # despite the name, the whole corpus is read into memory once,
+     # then pairs are yielded from it indefinitely
+     with open(txt_path, "r", encoding="utf-8") as f:
+         sentences = [line.strip() for line in f if line.strip()]
+     while True:
+         for s1 in sentences:
+             # positive pair (the sentence with itself)
+             x1 = pad_sentence(encode_sentence(s1))
+             yield (x1, x1), 1.0
+
+             # negative pairs (any sentence except itself)
+             for _ in range(negative_ratio):
+                 s2 = s1
+                 while s2 == s1:
+                     s2 = random.choice(sentences)
+                 x2 = pad_sentence(encode_sentence(s2))
+                 yield (x1, x2), 0.0

  dataset = tf.data.Dataset.from_generator(
+     lambda: gen_pairs_streaming(),
+     output_types=((tf.int32, tf.int32), tf.float32),
+     output_shapes=(((MAX_LEN,), (MAX_LEN,)), ())
+ ).shuffle(1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
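As a quick sanity check of the input pipeline, one batch can be inspected before training (a minimal sketch using the definitions above):

    for (a, b), y in dataset.take(1):
        print(a.shape, b.shape, y.shape)  # (400, 128) (400, 128) (400,)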
+ class EncoderBlock(tf.keras.layers.Layer):
+     def __init__(self, embed_dim=EMBED_DIM, ff_dim=1152, seq_len=MAX_LEN):
          super().__init__()
+         self.fc1 = layers.Dense(ff_dim)
+         self.fc2 = layers.Dense(embed_dim)
+         self.fc3 = layers.Dense(ff_dim)
+         self.fc4 = layers.Dense(embed_dim)
+
+         # shape (seq_len, embed_dim): the (B, L, L) token-similarity map below
+         # is contracted over its last axis, so the first axis here must be L,
+         # not embed_dim (L = 128 != 384 in this config)
+         self.w_proj = self.add_weight(
+             shape=(seq_len, embed_dim),
+             initializer="glorot_uniform",
+             trainable=True
+         )
+
+         self.alpha2 = layers.Dense(1)
+
+         self.ln = layers.LayerNormalization(epsilon=1e-5)
+         self.ln1 = layers.LayerNormalization(epsilon=1e-5)
+         self.ln2 = layers.LayerNormalization(epsilon=1e-5)
+
      def call(self, x):
+         x_norm = self.ln(x)
+         x = self.fc1(x_norm)
+         g, v = tf.split(x, 2, axis=-1)
+         x = tf.nn.silu(g) * v  # SwiGLU-style gating
+         x = self.fc2(x)
+
+         x = tf.matmul(x, x, transpose_b=True)           # (B, L, L) token-token similarity
+         x = tf.tensordot(x, self.w_proj, axes=[-1, 0])  # (B, L, D)
+
+         v = tf.nn.softmax(self.alpha2(v), axis=1) * x
+         x_norm = x_norm + self.ln2(v)
+
+         x = self.fc3(x_norm)
+         g, v = tf.split(x, 2, axis=-1)
+         x = tf.nn.silu(g) * v
+         x = self.fc4(x)
+
+         return x_norm + self.ln1(x)
+
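Since the block only mixes information across tokens and projects back to embed_dim, its output shape matches its input. A one-line shape check (a sketch under the constants above):

    out = EncoderBlock()(tf.zeros((2, MAX_LEN, EMBED_DIM)))
    print(out.shape)  # (2, 128, 384)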
+ class L2NormLayer(layers.Layer):
+     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
+         super().__init__(**kwargs)
+         self.axis = axis
+         self.epsilon = epsilon
+
+     def call(self, inputs):
+         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)
+
+     def get_config(self):
+         return {"axis": self.axis, "epsilon": self.epsilon, **super().get_config()}
+
+ class SentenceEncoder(tf.keras.Model):
+     def __init__(self, vocab_size, embed_dim=384, latent_dim=384, max_len=128, pad_id=pad_id):
          super().__init__()
+         self.pad_id = pad_id
+         self.embed = layers.Embedding(vocab_size, embed_dim)
+         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
+         self.blocks = [EncoderBlock() for _ in range(1)]
+         self.attn_pool = layers.Dense(1)
+         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
+         self.latent = layers.Dense(latent_dim, activation=None)  # tanh removed
+         self.l2norm = L2NormLayer()  # newly added

+     def call(self, x):
+         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
+         x_embed = self.embed(x) + self.pos_embed(positions)
+         mask = tf.cast(tf.not_equal(x, self.pad_id), tf.float32)
+         x = x_embed
+         for block in self.blocks:
+             x = block(x)
+         x = self.ln_f(x)
+
+         # attention pooling: padded positions are pushed to -1e9 before softmax
+         scores = self.attn_pool(x)
+         scores = tf.where(tf.equal(mask[..., tf.newaxis], 0), -1e9, scores)
+         scores = tf.nn.softmax(scores, axis=1)
+         pooled = tf.reduce_sum(x * scores, axis=1)
+
+         latent = self.latent(pooled)
+         return self.l2norm(latent)  # return the L2-normalized embedding
+
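Because of the final L2NormLayer, every sentence embedding has unit length, which is what lets a plain dot product stand in for cosine similarity below. A quick check (a sketch; untrained weights are random, but the norm is still 1):

    enc = SentenceEncoder(vocab_size=vocab_size)
    v = enc(np.array([pad_sentence(encode_sentence("안녕하세요"))]))
    print(np.linalg.norm(v.numpy()[0]))  # ≈ 1.0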
+ # ===============================
+ # 5️⃣ Cosine similarity layer + contrastive loss
+ # ===============================
+ class CosineSimilarityLayer(layers.Layer):
+     def call(self, inputs):
+         v1, v2 = inputs
+         # inputs are already L2-normalized, so the dot product is the cosine similarity
+         return tf.reduce_sum(v1 * v2, axis=-1)
+
+ def contrastive_loss(margin=0.5):
+     def loss(y_true, y_pred):
+         y_true = tf.cast(y_true, tf.float32)
+         dist = 1 - y_pred  # cosine distance
+         pos_loss = y_true * tf.square(dist)
+         neg_loss = (1 - y_true) * tf.square(tf.maximum(margin - dist, 0))
+         return tf.reduce_mean(pos_loss + neg_loss)
+     return loss
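To see what margin = 0.5 does, a hand-worked example (not from the source): a positive pair with cosine similarity 0.9 has dist = 0.1 and contributes 0.1^2 = 0.01; a negative pair with cosine 0.8 has dist = 0.2 and contributes max(0.5 - 0.2, 0)^2 = 0.09; a negative pair with cosine 0.4 has dist = 0.6 >= margin and contributes nothing. Negatives are only pushed apart while their cosine similarity stays above 1 - margin = 0.5.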
+
+ encoder = SentenceEncoder(vocab_size=vocab_size)
+
+ # ===============================
+ # 6️⃣ Siamese model definition
+ # ===============================
+ input1 = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32)
+ input2 = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32)
+ v1 = encoder(input1)
+ v2 = encoder(input2)
+ cos_sim = CosineSimilarityLayer()([v1, v2])
+ siamese_model = tf.keras.Model([input1, input2], cos_sim)
+ siamese_model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss=contrastive_loss(margin=0.5))
+ siamese_model.summary()
+
+ # ===============================
+ # 7️⃣ Training
+ # ===============================
+ # steps_per_epoch = 36757266 // BATCH_SIZE  # full 36M-sentence corpus
+ steps_per_epoch = 1000000 // BATCH_SIZE
+ # generator-based streaming training; adjust steps_per_epoch as needed
+ siamese_model.fit(dataset, epochs=1, steps_per_epoch=steps_per_epoch)
+ encoder.save_weights("encoder.weights.h5")
+ siamese_model.save_weights("siamese_model.weights.h5")
+
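Because SentenceEncoder is a subclassed Keras model, its variables are only created on the first forward pass; to reload the saved weights in a fresh session, build the model by calling it once (a minimal sketch, assuming the same class definitions):

    encoder = SentenceEncoder(vocab_size=vocab_size)
    _ = encoder(tf.zeros((1, MAX_LEN), dtype=tf.int32))  # build variables
    encoder.load_weights("encoder.weights.h5")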
+ # ===============================
+ # 8️⃣ Build + cache corpus vectors (always regenerated)
+ # ===============================
+ LIMIT = 1000  # number of corpus sentences to index for search
+ prompts = []
+
+ # read the prompts first
+ with open(DATA_PATH, "r", encoding="utf-8") as f:
+     for i, line in enumerate(f):
+         if i >= LIMIT:
              break
+         line = line.strip()
+         if line:
+             prompts.append(line)
+
+ def get_sentence_vector(sentence):
+     tokens = pad_sentence(encode_sentence(sentence))
+     return encoder(np.array([tokens])).numpy()[0]
+
+ # always rebuild corpus_vectors (ignore any existing .npy cache)
+ corpus_vectors = np.stack([get_sentence_vector(p) for p in prompts]).astype(np.float16)
+ np.save("corpus_vectors.npy", corpus_vectors)
+
+ # precompute norms (vectors are unit-norm in float32, but the float16 cast perturbs them slightly)
+ corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
+
+ # ===============================
+ # 9️⃣ Search function
+ # ===============================
+ def search(query, top_k=3):
+     q_vec = get_sentence_vector(query).astype(np.float16)
+     sims = corpus_vectors @ q_vec
+     sims /= (corpus_norms * np.linalg.norm(q_vec) + 1e-8)
+
+     # clamp top_k to the corpus size
+     top_k = min(top_k, len(prompts))
+     top_idx = np.argsort(sims)[::-1][:top_k]
+
+     return [(prompts[i], float(sims[i])) for i in top_idx]
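A full np.argsort is fine at LIMIT = 1000, but it sorts the entire corpus; for a much larger index, np.argpartition selects the top-k candidates in linear time and only those get sorted (a sketch over the same sims array):

    cand = np.argpartition(sims, -top_k)[-top_k:]
    top_idx = cand[np.argsort(sims[cand])[::-1]]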
+
+ # ===============================
+ # 🔟 Test
+ # ===============================
+ query = "우리가 핸드폰, 배를 세계에서 제일 잘 만드는 것 이상으로 사랑을 제일 잘 실천할 수 있는 능력, 자질, 저력이 우리에게 있다."
+ results = search(query)
+ for p, s in results:
+     print(f"Prompt: {p}\nsimilarity: {s:.3f}\n---")
+
+ query = "안녕하세요! 오늘 날씨 어떤가요?"
+ results = search(query)
+ for p, s in results:
+     print(f"Prompt: {p}\nsimilarity: {s:.3f}\n---")