Update train.py
train.py
CHANGED
@@ -1,19 +1,20 @@
-
+# train.py
+import argparse, os, json
 from pathlib import Path
 from datasets import load_dataset
 from transformers import (
     AutoTokenizer, AutoModelForCausalLM,
     DataCollatorForLanguageModeling, Trainer, TrainingArguments
 )
+import zipfile
 
-ROOT = Path(
-DONE = ROOT / "TRAIN_DONE"  # <- write here
-ERRF = ROOT / "TRAIN_ERROR"
+ROOT = Path(__file__).resolve().parent
 
 def parse_args():
     ap = argparse.ArgumentParser()
     ap.add_argument("--dataset", required=True)
     ap.add_argument("--output", default=str(ROOT / "trained_model"))
+    ap.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
     ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
     ap.add_argument("--epochs", type=float, default=1.0)
     ap.add_argument("--batch_size", type=int, default=2)
@@ -24,6 +25,9 @@ def parse_args():
 
 def main():
     a = parse_args()
+    out_dir = Path(a.output).resolve()
+    zip_path = Path(a.zip_path).resolve()
+
     print(f"📦 Loading dataset from: {a.dataset}", flush=True)
     ds = load_dataset("json", data_files=a.dataset, split="train")
     cols = ds.column_names
@@ -52,7 +56,7 @@ def main():
     collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
 
     args = TrainingArguments(
-        output_dir=
+        output_dir=str(out_dir),
         overwrite_output_dir=True,
         per_device_train_batch_size=a.batch_size,
         num_train_epochs=a.epochs,
@@ -70,18 +74,20 @@ def main():
     print("🚀 Training…", flush=True)
     trainer.train()
 
-    print(f"💾 Saving to {
-    os.makedirs(
-    trainer.save_model(
-    tok.save_pretrained(
-
+    print(f"💾 Saving to {out_dir}", flush=True)
+    os.makedirs(out_dir, exist_ok=True)
+    trainer.save_model(out_dir)
+    tok.save_pretrained(out_dir)
+
+    # Zip the folder ourselves (no flags, no UI dependency)
+    if zip_path.exists():
+        zip_path.unlink()
+    print(f"📦 Zipping → {zip_path.name}", flush=True)
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
+        for p in out_dir.rglob("*"):
+            z.write(p, arcname=p.relative_to(out_dir))
+
     print("✅ Done.", flush=True)
 
 if __name__ == "__main__":
-    try:
-        DONE.unlink(missing_ok=True)
-        ERRF.unlink(missing_ok=True)
-        main()
-    except Exception:
-        ERRF.write_text(traceback.format_exc())
-        raise
+    main()
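The new zip step walks every entry under out_dir with rglob("*") and stores it under a path relative to the folder root, so the archive unpacks without a leading directory. A minimal alternative sketch using the standard library's shutil.make_archive, which performs the same walk and relative-path handling internally (the out_dir and zip_path names are reused from train.py purely for illustration):

import shutil
from pathlib import Path

out_dir = Path("trained_model").resolve()
zip_path = out_dir.with_suffix(".zip")

# make_archive appends the ".zip" extension itself, so pass the stem
# as base_name; it archives the contents of root_dir relative to root_dir.
shutil.make_archive(str(zip_path.with_suffix("")), "zip", root_dir=out_dir)

Either approach yields the same archive layout; the explicit zipfile loop in train.py just keeps the compression choice and file selection visible in the script.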
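Once training finishes, trained_model.zip (the --zip_path default) holds a standard Hugging Face checkpoint, since trainer.save_model and tok.save_pretrained write the usual config, weight, and tokenizer files. A hedged sketch of reloading it after download, assuming the script's default paths:

import zipfile
from transformers import AutoModelForCausalLM, AutoTokenizer

# Unpack the archive produced by train.py; entries were stored relative
# to the output folder, so everything lands directly in trained_model/.
with zipfile.ZipFile("trained_model.zip") as z:
    z.extractall("trained_model")

tok = AutoTokenizer.from_pretrained("trained_model")
model = AutoModelForCausalLM.from_pretrained("trained_model")

# Quick smoke test of the fine-tuned causal LM.
inputs = tok("def hello():", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=32)
print(tok.decode(out[0]))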