from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
# Load dataset from Hugging Face Hub
dataset = load_dataset("Percy3822/quiz_model")
# Preprocess: combine prompt + completion into single string
def format_for_training(example):
    # Convert dict completion to string if needed
    if isinstance(example["completion"], dict):
        example["completion"] = str(example["completion"])
    return {"text": example["prompt"] + "\n" + example["completion"]}
dataset = dataset.map(format_for_training)
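# Quick sanity check of the formatting on a toy record (hypothetical values --
# the real schema of Percy3822/quiz_model is assumed here, not verified):
sample = {"prompt": "Q: What is 2+2?", "completion": {"answer": "4"}}
assert format_for_training(sample)["text"] == "Q: What is 2+2?\n{'answer': '4'}"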
# Load tokenizer and model (small model for low VRAM)
model_name = "distilgpt2" # Small and fast for testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 tokenizers ship without a pad token
# Tokenize
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
dataset = dataset.map(tokenize, batched=True)
# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False -> causal LM: labels are copies of input_ids (shifting happens inside the model)
# Training args
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5,
    push_to_hub=True,
    hub_model_id="Percy3822/quiz_model",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],  # Use train for eval in testing
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
# Push trained model to Hub
trainer.push_to_hub()
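
# After training, the fine-tuned model can be loaded back from the Hub for
# inference. A minimal sketch (the prompt below is a made-up example and the
# generation settings are illustrative, not tuned values):
from transformers import pipeline

generator = pipeline("text-generation", model="Percy3822/quiz_model")
output = generator("Write a quiz question about the solar system.", max_new_tokens=64)
print(output[0]["generated_text"])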