Update train.py

train.py CHANGED
@@ -30,14 +30,6 @@ def main():
     cols = ds.column_names
     print("🧾 Columns:", cols, flush=True)
 
-    # Accept either {"text": "..."} or {"prompt": "...", "completion": "..."}
-    def to_text(example):
-        if "text" in example:
-            return example["text"]
-        if "prompt" in example and "completion" in example:
-            return (str(example["prompt"]).rstrip() + "\n" + str(example["completion"]))
-        raise ValueError("Dataset must have 'text' or 'prompt' + 'completion'.")
-
     if a.subset and a.subset > 0:
         ds = ds.select(range(min(a.subset, len(ds))))
         print(f"✂ Subset: {len(ds)} rows", flush=True)
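Note: the per-example `to_text` helper removed above appears to be the bug this commit fixes. The ✅ comment added in the next hunk says why: `Dataset.map(..., batched=True)` hands the mapping function a dict of lists, not a single example, so per-example `str()` calls end up operating on whole columns. A minimal standalone sketch of that failure mode, with made-up values for illustration:

# With batched=True the mapper receives a dict of lists; calling str() on a
# column stringifies the entire list instead of one row's value.
batch = {"prompt": ["Hi", "Yo"], "completion": ["there", "friend"]}
garbled = str(batch["prompt"]).rstrip() + "\n" + str(batch["completion"])
print(garbled)  # ['Hi', 'Yo']\n['there', 'friend'] ... one mangled string for the whole batch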
@@ -48,8 +40,16 @@ def main():
     tok.pad_token = tok.eos_token
     model = AutoModelForCausalLM.from_pretrained(a.model_name)
 
+    # ✅ batched=True passes dict-of-lists
     def tokenize(batch):
-        texts = …
+        if "text" in batch:
+            texts = batch["text"]
+        elif "prompt" in batch and "completion" in batch:
+            prompts = batch["prompt"]
+            completions = batch["completion"]
+            texts = [(str(p).rstrip() + "\n" + str(c)) for p, c in zip(prompts, completions)]
+        else:
+            raise ValueError("Dataset must have 'text' or 'prompt' + 'completion'.")
         return tok(texts, padding="max_length", truncation=True, max_length=a.block_size)
 
     print("🔁 Tokenizing…", flush=True)
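The inlined replacement walks the two columns in parallel, matching the dict-of-lists shape that `batched=True` actually delivers. A self-contained sketch of the same pattern; the dataset values are invented, and a character count stands in for `tok(...)` so it runs without downloading a tokenizer:

from datasets import Dataset

ds = Dataset.from_dict({"prompt": ["Hi", "Yo"], "completion": ["there", "friend"]})

def tokenize(batch):
    # batch is {"prompt": [...], "completion": [...]}, a dict of lists
    texts = [str(p).rstrip() + "\n" + str(c) for p, c in zip(batch["prompt"], batch["completion"])]
    return {"n_chars": [len(t) for t in texts]}  # stand-in for tok(texts, ...)

ds = ds.map(tokenize, batched=True)
print(ds["n_chars"])  # [8, 9]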
@@ -66,7 +66,7 @@ def main():
         save_steps=200,
         save_total_limit=1,
         report_to=[],
-        fp16=False,
+        fp16=False,
     )
 
     print("⚙ Trainer…", flush=True)
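Keeping `fp16=False` is the safe setting for a CPU-only Space, since transformers typically rejects fp16 mixed precision when no CUDA device is available. A hypothetical guard, not part of this commit, that enables it only where it can work:

import torch

# Hypothetical variant (not in this commit): turn on mixed precision only
# when a CUDA device is actually present.
use_fp16 = torch.cuda.is_available()
# ...then pass fp16=use_fp16 into TrainingArguments(...)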
@@ -83,8 +83,4 @@ def main():
     print("✅ Done.", flush=True)
 
 if __name__ == "__main__":
-    try:
-        main()
-    except Exception as e:
-        print(f"❌ Error during training: {e}", flush=True)
-        raise
+    main()