# NOTE(review): removed non-code page-scrape residue that preceded this script
# (a "File size" line, git commit hashes, and a copied line-number gutter).
import argparse, os
from pathlib import Path
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)
import zipfile

def parse_args():
    """Parse CLI options: dataset/output locations plus training hyper-parameters."""
    parser = argparse.ArgumentParser()
    # The three path arguments are all mandatory.
    for flag, help_text in (
        ("--dataset", "Path to .jsonl"),
        ("--output", "Output model folder"),
        ("--zip_path", "Path to write .zip"),
    ):
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--model_name", default="Salesforce/codegen-350M-multi")
    parser.add_argument("--epochs", type=float, default=1.0)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    return parser.parse_args()

def main():
    """Fine-tune a causal LM on a JSONL dataset, save it, and zip the output folder.

    Reads CLI args via parse_args(); the dataset must expose either a 'text'
    column or a 'prompt' + 'completion' pair.
    """
    a = parse_args()
    out_dir  = Path(a.output).resolve()
    zip_path = Path(a.zip_path).resolve()
    # Create BOTH destination parents up front. The original only created
    # out_dir's parent, so opening the zip failed whenever zip_path pointed
    # into a directory that did not exist yet.
    out_dir.parent.mkdir(parents=True, exist_ok=True)
    zip_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"📦 Loading dataset from: {a.dataset}", flush=True)
    ds = load_dataset("json", data_files=a.dataset, split="train")
    cols = ds.column_names
    print("🧾 Columns:", cols, flush=True)

    tok = AutoTokenizer.from_pretrained(a.model_name, use_fast=True)
    # Many causal-LM tokenizers ship without a pad token; reuse EOS so the
    # padding="max_length" tokenization below does not raise.
    if tok.pad_token is None and tok.eos_token is not None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(a.model_name)

    def to_text(batch):
        # Accept either a plain 'text' column or a prompt/completion pair
        # joined with a newline.
        if "text" in batch:
            return batch["text"]
        if "prompt" in batch and "completion" in batch:
            return [str(p).rstrip() + "\n" + str(c) for p, c in zip(batch["prompt"], batch["completion"])]
        raise ValueError("Dataset must have 'text' or 'prompt' + 'completion'.")

    def tokenize(batch):
        # Fixed-length padding keeps every example at block_size tokens.
        texts = to_text(batch)
        return tok(texts, padding="max_length", truncation=True, max_length=a.block_size)

    print("🔁 Tokenizing…", flush=True)
    tokenized = ds.map(tokenize, batched=True, remove_columns=cols)
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

    args = TrainingArguments(
        output_dir=str(out_dir),
        overwrite_output_dir=True,
        per_device_train_batch_size=a.batch_size,
        num_train_epochs=a.epochs,
        learning_rate=a.learning_rate,
        logging_steps=5,
        save_strategy="no",  # we save once, explicitly, after training
        report_to=[],
        fp16=False,
    )

    print("⚙ Trainer…", flush=True)
    trainer = Trainer(model=model, args=args, train_dataset=tokenized,
                      tokenizer=tok, data_collator=collator)

    print("🚀 Training…", flush=True)
    trainer.train()

    print(f"💾 Saving to {out_dir}", flush=True)
    out_dir.mkdir(parents=True, exist_ok=True)
    trainer.save_model(out_dir)
    tok.save_pretrained(out_dir)

    # Zip the folder. Remove a stale archive first so we always write fresh.
    if zip_path.exists():
        zip_path.unlink()
    print(f"📦 Zipping → {zip_path.name}", flush=True)
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in out_dir.rglob("*"):
            # Only archive real files, and never the archive itself (guards
            # against zip_path living inside out_dir).
            if p.is_file() and p != zip_path:
                z.write(p, arcname=p.relative_to(out_dir))

    print("✅ Done.", flush=True)

# Entry point: run training only when executed as a script, not on import.
if __name__ == "__main__":
    main()