Percy3822 committed on
Commit
62c9d8d
·
verified ·
1 Parent(s): 69630f9

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +4 -10
train.py CHANGED
@@ -1,5 +1,4 @@
1
- # train.py
2
- import argparse, os, json
3
  from pathlib import Path
4
  from datasets import load_dataset
5
  from transformers import (
@@ -8,11 +7,11 @@ from transformers import (
8
  )
9
  import zipfile
10
 
11
- ROOT = Path(__file__).resolve().parent
12
 
13
  def parse_args():
14
  ap = argparse.ArgumentParser()
15
- ap.add_argument("--dataset", required=True)
16
  ap.add_argument("--output", default=str(ROOT / "trained_model"))
17
  ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
18
  ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
@@ -20,7 +19,6 @@ def parse_args():
20
  ap.add_argument("--batch_size", type=int, default=2)
21
  ap.add_argument("--block_size", type=int, default=256)
22
  ap.add_argument("--learning_rate", type=float, default=5e-5)
23
- ap.add_argument("--subset", type=int, default=0)
24
  return ap.parse_args()
25
 
26
  def main():
@@ -33,10 +31,6 @@ def main():
33
  cols = ds.column_names
34
  print("🧾 Columns:", cols, flush=True)
35
 
36
- if a.subset and a.subset > 0:
37
- ds = ds.select(range(min(a.subset, len(ds))))
38
- print(f"✂ Subset: {len(ds)} rows", flush=True)
39
-
40
  tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
41
  if tok.pad_token is None and tok.eos_token is not None:
42
  tok.pad_token = tok.eos_token
@@ -79,7 +73,7 @@ def main():
79
  trainer.save_model(out_dir)
80
  tok.save_pretrained(out_dir)
81
 
82
- # Zip the folder ourselves (no flags, no UI dependency)
83
  if zip_path.exists():
84
  zip_path.unlink()
85
  print(f"📦 Zipping → {zip_path.name}", flush=True)
 
1
+ import argparse, os
 
2
  from pathlib import Path
3
  from datasets import load_dataset
4
  from transformers import (
 
7
  )
8
  import zipfile
9
 
10
+ ROOT = Path(__file__).resolve().parent
11
 
12
  def parse_args():
13
  ap = argparse.ArgumentParser()
14
+ ap.add_argument("--dataset", required=True, help="Path to .jsonl (or a folder you adapt later)")
15
  ap.add_argument("--output", default=str(ROOT / "trained_model"))
16
  ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
17
  ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
 
19
  ap.add_argument("--batch_size", type=int, default=2)
20
  ap.add_argument("--block_size", type=int, default=256)
21
  ap.add_argument("--learning_rate", type=float, default=5e-5)
 
22
  return ap.parse_args()
23
 
24
  def main():
 
31
  cols = ds.column_names
32
  print("🧾 Columns:", cols, flush=True)
33
 
 
 
 
 
34
  tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
35
  if tok.pad_token is None and tok.eos_token is not None:
36
  tok.pad_token = tok.eos_token
 
73
  trainer.save_model(out_dir)
74
  tok.save_pretrained(out_dir)
75
 
76
+ # Zip the folder
77
  if zip_path.exists():
78
  zip_path.unlink()
79
  print(f"📦 Zipping → {zip_path.name}", flush=True)