Spaces:
Sleeping
Sleeping
Update train.py
Browse files
train.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
import argparse, os, json
|
| 3 |
from pathlib import Path
|
| 4 |
from datasets import load_dataset
|
| 5 |
from transformers import (
|
|
@@ -8,11 +7,11 @@ from transformers import (
|
|
| 8 |
)
|
| 9 |
import zipfile
|
| 10 |
|
| 11 |
-
ROOT = Path(
|
| 12 |
|
| 13 |
def parse_args():
|
| 14 |
ap = argparse.ArgumentParser()
|
| 15 |
-
ap.add_argument("--dataset", required=True)
|
| 16 |
ap.add_argument("--output", default=str(ROOT / "trained_model"))
|
| 17 |
ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
|
| 18 |
ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
|
|
@@ -20,7 +19,6 @@ def parse_args():
|
|
| 20 |
ap.add_argument("--batch_size", type=int, default=2)
|
| 21 |
ap.add_argument("--block_size", type=int, default=256)
|
| 22 |
ap.add_argument("--learning_rate", type=float, default=5e-5)
|
| 23 |
-
ap.add_argument("--subset", type=int, default=0)
|
| 24 |
return ap.parse_args()
|
| 25 |
|
| 26 |
def main():
|
|
@@ -33,10 +31,6 @@ def main():
|
|
| 33 |
cols = ds.column_names
|
| 34 |
print("🧾 Columns:", cols, flush=True)
|
| 35 |
|
| 36 |
-
if a.subset and a.subset > 0:
|
| 37 |
-
ds = ds.select(range(min(a.subset, len(ds))))
|
| 38 |
-
print(f"✂ Subset: {len(ds)} rows", flush=True)
|
| 39 |
-
|
| 40 |
tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
|
| 41 |
if tok.pad_token is None and tok.eos_token is not None:
|
| 42 |
tok.pad_token = tok.eos_token
|
|
@@ -79,7 +73,7 @@ def main():
|
|
| 79 |
trainer.save_model(out_dir)
|
| 80 |
tok.save_pretrained(out_dir)
|
| 81 |
|
| 82 |
-
# Zip the folder
|
| 83 |
if zip_path.exists():
|
| 84 |
zip_path.unlink()
|
| 85 |
print(f"📦 Zipping → {zip_path.name}", flush=True)
|
|
|
|
| 1 |
+
import argparse, os
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
from datasets import load_dataset
|
| 4 |
from transformers import (
|
|
|
|
| 7 |
)
|
| 8 |
import zipfile
|
| 9 |
|
| 10 |
+
# Directory containing this script; the default --output and --zip_path
# locations are built relative to it. The module attribute is spelled
# __file__ (double underscores) -- `_file_` is undefined and raises NameError.
ROOT = Path(__file__).resolve().parent
|
| 11 |
|
| 12 |
def parse_args():
|
| 13 |
ap = argparse.ArgumentParser()
|
| 14 |
+
ap.add_argument("--dataset", required=True, help="Path to .jsonl (or a folder you adapt later)")
|
| 15 |
ap.add_argument("--output", default=str(ROOT / "trained_model"))
|
| 16 |
ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
|
| 17 |
ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
|
|
|
|
| 19 |
ap.add_argument("--batch_size", type=int, default=2)
|
| 20 |
ap.add_argument("--block_size", type=int, default=256)
|
| 21 |
ap.add_argument("--learning_rate", type=float, default=5e-5)
|
|
|
|
| 22 |
return ap.parse_args()
|
| 23 |
|
| 24 |
def main():
|
|
|
|
| 31 |
cols = ds.column_names
|
| 32 |
print("🧾 Columns:", cols, flush=True)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
|
| 35 |
if tok.pad_token is None and tok.eos_token is not None:
|
| 36 |
tok.pad_token = tok.eos_token
|
|
|
|
| 73 |
trainer.save_model(out_dir)
|
| 74 |
tok.save_pretrained(out_dir)
|
| 75 |
|
| 76 |
+
# Zip the folder
|
| 77 |
if zip_path.exists():
|
| 78 |
zip_path.unlink()
|
| 79 |
print(f"📦 Zipping → {zip_path.name}", flush=True)
|