Percy3822 committed on
Commit
a89bf8e
·
verified ·
1 Parent(s): 0aef10f

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +23 -17
train.py CHANGED
@@ -1,19 +1,20 @@
1
- import argparse, os, traceback
 
2
  from pathlib import Path
3
  from datasets import load_dataset
4
  from transformers import (
5
  AutoTokenizer, AutoModelForCausalLM,
6
  DataCollatorForLanguageModeling, Trainer, TrainingArguments
7
  )
 
8
 
9
- ROOT = Path(__file__).resolve().parent # /home/user/app
10
- DONE = ROOT / "TRAIN_DONE" # <- write here
11
- ERRF = ROOT / "TRAIN_ERROR"
12
 
13
  def parse_args():
14
  ap = argparse.ArgumentParser()
15
  ap.add_argument("--dataset", required=True)
16
  ap.add_argument("--output", default=str(ROOT / "trained_model"))
 
17
  ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
18
  ap.add_argument("--epochs", type=float, default=1.0)
19
  ap.add_argument("--batch_size", type=int, default=2)
@@ -24,6 +25,9 @@ def parse_args():
24
 
25
  def main():
26
  a = parse_args()
 
 
 
27
  print(f"📦 Loading dataset from: {a.dataset}", flush=True)
28
  ds = load_dataset("json", data_files=a.dataset, split="train")
29
  cols = ds.column_names
@@ -52,7 +56,7 @@ def main():
52
  collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
53
 
54
  args = TrainingArguments(
55
- output_dir=a.output,
56
  overwrite_output_dir=True,
57
  per_device_train_batch_size=a.batch_size,
58
  num_train_epochs=a.epochs,
@@ -70,18 +74,20 @@ def main():
70
  print("🚀 Training…", flush=True)
71
  trainer.train()
72
 
73
- print(f"💾 Saving to {a.output}", flush=True)
74
- os.makedirs(a.output, exist_ok=True)
75
- trainer.save_model(a.output)
76
- tok.save_pretrained(a.output)
77
- DONE.write_text("ok") # <- SIGNAL!
 
 
 
 
 
 
 
 
78
  print("✅ Done.", flush=True)
79
 
80
  if __name__ == "__main__":
81
- try:
82
- DONE.unlink(missing_ok=True)
83
- ERRF.unlink(missing_ok=True)
84
- main()
85
- except Exception:
86
- ERRF.write_text(traceback.format_exc())
87
- raise
 
1
+ # train.py
2
+ import argparse, os, json
3
  from pathlib import Path
4
  from datasets import load_dataset
5
  from transformers import (
6
  AutoTokenizer, AutoModelForCausalLM,
7
  DataCollatorForLanguageModeling, Trainer, TrainingArguments
8
  )
9
+ import zipfile
10
 
11
+ ROOT = Path(_file_).resolve().parent
 
 
12
 
13
  def parse_args():
14
  ap = argparse.ArgumentParser()
15
  ap.add_argument("--dataset", required=True)
16
  ap.add_argument("--output", default=str(ROOT / "trained_model"))
17
+ ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
18
  ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
19
  ap.add_argument("--epochs", type=float, default=1.0)
20
  ap.add_argument("--batch_size", type=int, default=2)
 
25
 
26
  def main():
27
  a = parse_args()
28
+ out_dir = Path(a.output).resolve()
29
+ zip_path = Path(a.zip_path).resolve()
30
+
31
  print(f"📦 Loading dataset from: {a.dataset}", flush=True)
32
  ds = load_dataset("json", data_files=a.dataset, split="train")
33
  cols = ds.column_names
 
56
  collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
57
 
58
  args = TrainingArguments(
59
+ output_dir=str(out_dir),
60
  overwrite_output_dir=True,
61
  per_device_train_batch_size=a.batch_size,
62
  num_train_epochs=a.epochs,
 
74
  print("🚀 Training…", flush=True)
75
  trainer.train()
76
 
77
+ print(f"💾 Saving to {out_dir}", flush=True)
78
+ os.makedirs(out_dir, exist_ok=True)
79
+ trainer.save_model(out_dir)
80
+ tok.save_pretrained(out_dir)
81
+
82
+ # Zip the folder ourselves (no flags, no UI dependency)
83
+ if zip_path.exists():
84
+ zip_path.unlink()
85
+ print(f"📦 Zipping → {zip_path.name}", flush=True)
86
+ with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
87
+ for p in out_dir.rglob("*"):
88
+ z.write(p, arcname=p.relative_to(out_dir))
89
+
90
  print("✅ Done.", flush=True)
91
 
92
  if __name__ == "__main__":
93
+ main()