import argparse
import os
import zipfile
from pathlib import Path

from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)
def parse_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, help="Path to .jsonl")
    ap.add_argument("--output", required=True, help="Output model folder")
    ap.add_argument("--zip_path", required=True, help="Path to write .zip")
    ap.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
    ap.add_argument("--epochs", type=float, default=1.0)
    ap.add_argument("--batch_size", type=int, default=2)
    ap.add_argument("--block_size", type=int, default=256)
    ap.add_argument("--learning_rate", type=float, default=5e-5)
    return ap.parse_args()
def main():
    a = parse_args()
    out_dir = Path(a.output).resolve()
    zip_path = Path(a.zip_path).resolve()
    out_dir.parent.mkdir(parents=True, exist_ok=True)

    print(f"📦 Loading dataset from: {a.dataset}", flush=True)
    ds = load_dataset("json", data_files=a.dataset, split="train")
    cols = ds.column_names
    print("🧾 Columns:", cols, flush=True)

    tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
    if tok.pad_token is None and tok.eos_token is not None:
        # Causal LM tokenizers often ship without a pad token; reuse EOS for padding.
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(a.model_name)
    def to_text(batch):
        # Accept either a ready-made "text" column or a "prompt"/"completion" pair.
        if "text" in batch:
            return batch["text"]
        if "prompt" in batch and "completion" in batch:
            return [str(p).rstrip() + "\n" + str(c) for p, c in zip(batch["prompt"], batch["completion"])]
        raise ValueError("Dataset must have 'text' or 'prompt' + 'completion'.")
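    # Illustrative records (not from this repo) that satisfy the schema check above,
    # one JSON object per line in the --dataset file:
    #   {"text": "def add(a, b):\n    return a + b"}
    #   {"prompt": "# add two numbers", "completion": "def add(a, b):\n    return a + b"}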
    def tokenize(batch):
        texts = to_text(batch)
        # Pad/truncate every example to a fixed block_size so batches stack cleanly.
        return tok(texts, padding="max_length", truncation=True, max_length=a.block_size)

    print("🔁 Tokenizing…", flush=True)
    tokenized = ds.map(tokenize, batched=True, remove_columns=cols)
    # mlm=False -> plain causal-LM objective: labels mirror the input ids, pad positions are masked out.
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
    args = TrainingArguments(
        output_dir=str(out_dir),
        overwrite_output_dir=True,
        per_device_train_batch_size=a.batch_size,
        num_train_epochs=a.epochs,
        learning_rate=a.learning_rate,
        logging_steps=5,
        save_strategy="no",
        report_to=[],
        fp16=False,
    )
print("⚙ Trainer…", flush=True)
trainer = Trainer(model=model, args=args, train_dataset=tokenized,
tokenizer=tok, data_collator=collator)
print("🚀 Training…", flush=True)
trainer.train()
print(f"💾 Saving to {out_dir}", flush=True)
os.makedirs(out_dir, exist_ok=True)
trainer.save_model(out_dir)
tok.save_pretrained(out_dir)
    # Zip the trained model folder so it can be downloaded as a single artifact.
    if zip_path.exists():
        zip_path.unlink()
    print(f"📦 Zipping → {zip_path.name}", flush=True)
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in out_dir.rglob("*"):
            z.write(p, arcname=p.relative_to(out_dir))
    print("✅ Done.", flush=True)
if __name__ == "__main__":
    main()
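# Example invocation (a sketch; the script file name and all paths below are
# assumptions, not fixed by this repo):
#   python finetune.py --dataset data/train.jsonl --output out/model \
#       --zip_path out/model.zip --epochs 1 --batch_size 2 --block_size 256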