# model_trainer/train.py
# Updated by Percy3822 (commit 10a7fed, verified).
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import os
import shutil
# --- Data ----------------------------------------------------------------
# Expects a local JSONL file with "prompt"/"completion" records.
dataset = load_dataset("json", data_files="python.jsonl")

# --- Model & tokenizer ---------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# GPT-2 family tokenizers ship without a pad token; reuse EOS so that
# fixed-length padding works, and keep the model config in sync with it.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
def tokenize(example):
    """Tokenize one prompt+completion record for causal-LM training.

    Concatenates ``prompt`` and ``completion``, tokenizes to a fixed
    length of 512 (truncating/padding as needed), and builds ``labels``
    from the input ids with padded positions masked out.

    Args:
        example: dict with string fields ``"prompt"`` and ``"completion"``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    full_text = example["prompt"] + example["completion"]
    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
    # Mask padding positions with -100, the ignore_index of the causal-LM
    # cross-entropy loss. Copying input_ids verbatim would train the model
    # to predict the pad token (== EOS here) at every padded position.
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
# Tokenize every training example (per-example map).
train_data = dataset["train"].map(tokenize)

# --- Training ------------------------------------------------------------
train_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",  # no checkpoints; the model is saved manually below
)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_data,
    tokenizer=tokenizer,
)
trainer.train()

# --- Persist -------------------------------------------------------------
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")

# Bundle the saved model directory into trained_model.zip for download.
shutil.make_archive("trained_model", 'zip', "trained_model")
print("✅ Training complete. Model zipped to 'trained_model.zip'")