Spaces:
Sleeping
Sleeping
File size: 3,185 Bytes
62c9d8d 7e32f1f 68c2563 a89bf8e 68c2563 62c9d8d 6d266d1 68c2563 62c9d8d 7e32f1f a89bf8e 68c2563 a89bf8e 68c2563 6d266d1 68c2563 7e32f1f 68c2563 70ef65d 6d266d1 70ef65d 68c2563 a89bf8e 68c2563 6d266d1 68c2563 70ef65d 68c2563 a89bf8e 62c9d8d a89bf8e 68c2563 a688193 a89bf8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import argparse, os
from pathlib import Path
from datasets import load_dataset
from transformers import (
AutoTokenizer, AutoModelForCausalLM,
DataCollatorForLanguageModeling, Trainer, TrainingArguments
)
import zipfile
# Directory containing this script; base for default output/zip paths.
# Fixed: original used `_file_`, which is undefined — the dunder is `__file__`.
ROOT = Path(__file__).resolve().parent
def parse_args():
    """Parse command-line options for the fine-tuning run.

    Returns:
        argparse.Namespace with the dataset path, output/zip locations,
        base model name, and training hyperparameters.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True,
                        help="Path to .jsonl (or a folder you adapt later)")
    parser.add_argument("--output", default=str(ROOT / "trained_model"))
    parser.add_argument("--zip_path", default=str(ROOT / "trained_model.zip"))
    parser.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
    parser.add_argument("--epochs", type=float, default=1.0)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    return parser.parse_args()
def main():
    """Fine-tune a causal language model on a JSONL dataset and zip the result.

    Pipeline: parse CLI args → load dataset → tokenize → train with the HF
    Trainer → save model + tokenizer to the output dir → zip that dir.
    """
    opts = parse_args()
    out_dir = Path(opts.output).resolve()
    zip_path = Path(opts.zip_path).resolve()

    print(f"📦 Loading dataset from: {opts.dataset}", flush=True)
    ds = load_dataset("json", data_files=opts.dataset, split="train")
    column_names = ds.column_names
    print("🧾 Columns:", column_names, flush=True)

    tokenizer = AutoTokenizer.from_pretrained(opts.model_name, use_fast=True)
    # Many causal-LM checkpoints ship without a pad token; reuse EOS so that
    # fixed-length padding below works.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(opts.model_name)

    def tokenize(batch):
        # Accept either a plain "text" column or "prompt"/"completion" pairs,
        # which get joined with a newline into a single training string.
        if "text" in batch:
            texts = batch["text"]
        elif "prompt" in batch and "completion" in batch:
            texts = [str(p).rstrip() + "\n" + str(c)
                     for p, c in zip(batch["prompt"], batch["completion"])]
        else:
            raise ValueError("Dataset must have 'text' or 'prompt' + 'completion'.")
        return tokenizer(texts, padding="max_length", truncation=True,
                         max_length=opts.block_size)

    print("🔁 Tokenizing…", flush=True)
    tokenized = ds.map(tokenize, batched=True, remove_columns=column_names)
    # mlm=False → plain causal-LM objective (labels = shifted input ids).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    train_args = TrainingArguments(
        output_dir=str(out_dir),
        overwrite_output_dir=True,
        per_device_train_batch_size=opts.batch_size,
        num_train_epochs=opts.epochs,
        learning_rate=opts.learning_rate,
        logging_steps=5,
        save_strategy="no",  # no mid-run checkpoints; we save once at the end
        report_to=[],
        fp16=False,
    )

    print("⚙ Trainer…", flush=True)
    trainer = Trainer(model=model, args=train_args, train_dataset=tokenized,
                      tokenizer=tokenizer, data_collator=collator)

    print("🚀 Training…", flush=True)
    trainer.train()

    print(f"💾 Saving to {out_dir}", flush=True)
    os.makedirs(out_dir, exist_ok=True)
    trainer.save_model(out_dir)
    tokenizer.save_pretrained(out_dir)

    # Replace any stale archive, then zip the saved model directory with
    # paths stored relative to the directory root.
    if zip_path.exists():
        zip_path.unlink()
    print(f"📦 Zipping → {zip_path.name}", flush=True)
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for item in out_dir.rglob("*"):
            zf.write(item, arcname=item.relative_to(out_dir))
    print("✅ Done.", flush=True)
# Script entry point. Fixed: original ended with a stray trailing `|`
# (copy/paste artifact) that made `main() |` a SyntaxError.
if __name__ == "__main__":
    main()