# quiz_model_trainer/train_model.py
import os
import zipfile
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
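# Base model to fine-tune, output directory for the result, and max token length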
MODEL_NAME = "distilgpt2"
OUTPUT_DIR = "trained_model"
MAX_LENGTH = 128
# Load dataset from local JSONL
dataset = load_dataset("json", data_files={"train": "sample_dataset.jsonl"})
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
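# distilgpt2 has no dedicated pad token, so reuse the end-of-sequence token for padding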
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    texts = []
    for p, c in zip(examples["prompt"], examples["completion"]):
        # Completions may be structured dicts; flatten them into a single string.
        if isinstance(c, dict):
            c_str = (
                f"Q: {c.get('question', '')} Options: {c.get('options', [])} "
                f"Answer: {c.get('answer', '')} Explanation: {c.get('explanation', '')}"
            )
        else:
            c_str = str(c)
        texts.append(p + " " + c_str)
    tokens = tokenizer(texts, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    # For causal LM training the labels are the input ids, but padded positions
    # are masked with -100 so the loss ignores them (pad and EOS share an id here).
    tokens["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
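# Tokenize the full dataset, dropping the raw text columns so only model inputs remain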
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
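# Small-scale settings (batch size 2, a single epoch, per-step logging) suited to a quick demo run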
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=1,
    save_strategy="epoch",
    learning_rate=2e-5,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)
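# Fine-tune, then save the model weights and tokenizer so OUTPUT_DIR is self-contained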
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# Zip the trained model for download
zip_filename = "trained_model.zip"
with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(OUTPUT_DIR):
        for file in files:
            filepath = os.path.join(root, file)
            zipf.write(filepath, os.path.relpath(filepath, OUTPUT_DIR))
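# Optional sanity check (a minimal sketch, not part of the original script):
# reload the saved artifacts and generate from a sample prompt to confirm the
# exported model works end to end. The prompt below is a hypothetical example.
from transformers import pipeline

generator = pipeline("text-generation", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR)
sample = generator("Write a quiz question about photosynthesis.", max_new_tokens=60)
print(sample[0]["generated_text"])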