OpenLab-NLP
/

model-prototype

Model card Files Files and versions

Yuchan committed on Nov 25, 2025

Commit

1512afd

·

verified ·

1 Parent(s): 395c8b3

Update Model.py

Files changed (1) hide show

Model.py +7 -4

Model.py CHANGED Viewed

@@ -77,9 +77,11 @@ def text_to_ids(text):
 def ids_to_text(ids):
     return sp.decode(ids)
-def txt_stream(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
-        for line in f:
             text = line.strip()
             if not text:
                 continue
@@ -98,15 +100,16 @@ def txt_stream(file_path):
                 tf.convert_to_tensor(target, dtype=tf.int32)
             )
 dataset = tf.data.Dataset.from_generator(
-    lambda: txt_stream(DATA_PATH),
     output_signature=(
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
     )
 )
 dataset = dataset.shuffle(2000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
 with strategy.scope():

 def ids_to_text(ids):
     return sp.decode(ids)
+def txt_stream(file_path, num_lines=None):
     with open(file_path, "r", encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if num_lines is not None and i >= num_lines:
+                break  # 지정한 라인까지만 읽음
             text = line.strip()
             if not text:
                 continue
                 tf.convert_to_tensor(target, dtype=tf.int32)
             )
+# Dataset 생성 (예: 처음 10,000라인만)
 dataset = tf.data.Dataset.from_generator(
+    lambda: txt_stream(DATA_PATH, num_lines=10000),
     output_signature=(
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
     )
 )
 dataset = dataset.shuffle(2000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
 with strategy.scope():