File size: 1,812 Bytes
95c2184
2c1c9e4
95c2184
2c1c9e4
 
95c2184
2c1c9e4
95c2184
2c1c9e4
 
 
 
95c2184
 
 
2c1c9e4
 
95c2184
2c1c9e4
95c2184
2c1c9e4
 
 
95c2184
2c1c9e4
95c2184
2c1c9e4
95c2184
 
2c1c9e4
 
 
 
95c2184
 
 
2c1c9e4
 
95c2184
2c1c9e4
 
 
95c2184
2c1c9e4
 
95c2184
 
 
 
 
2c1c9e4
 
 
 
95c2184
 
 
2c1c9e4
 
95c2184
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load dataset from Hugging Face Hub
# NOTE(review): needs network access (and HF auth for private repos); the repo is
# presumably a prompt/completion dataset with a "train" split — the Trainer below
# indexes dataset["train"], so confirm the split name against the Hub repo.
dataset = load_dataset("Percy3822/quiz_model")

# Preprocess: combine prompt + completion into single string
def format_for_training(example):
    """Build the training text for one example: prompt, newline, completion.

    A dict-valued completion is stringified first (and written back into the
    example, matching the original in-place behavior under ``Dataset.map``).
    Returns a dict with a single "text" key.
    """
    completion = example["completion"]
    if isinstance(completion, dict):
        completion = str(completion)
        example["completion"] = completion
    return {"text": f"{example['prompt']}\n{completion}"}

# Add the combined "text" column to every split (map runs per-example here).
dataset = dataset.map(format_for_training)

# Load tokenizer and model (small model for low VRAM)
model_name = "distilgpt2"  # Small and fast for testing; also read below when loading the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 tokenizers ship without a pad token; reuse EOS so padding="max_length" works.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize
def tokenize(examples):
    """Tokenize a batch's "text" fields, padded/truncated to 128 tokens."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize all splits in batches; adds input_ids/attention_mask columns.
dataset = dataset.map(tokenize, batched=True)

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Data collator
# mlm=False => causal LM: labels are a copy of input_ids (pad positions masked to -100).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training args
# NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
# transformers v4.41 and the old name was removed in later releases —
# confirm against the pinned transformers version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",             # checkpoints and final model land here
    overwrite_output_dir=True,          # clobber any previous run in ./results
    evaluation_strategy="epoch",        # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=2,      # tiny batch for low-VRAM testing
    num_train_epochs=1,                 # single epoch: smoke-test run
    save_strategy="epoch",              # checkpoint once per epoch
    logging_dir="./logs",
    logging_steps=5,
    push_to_hub=True,                   # upload checkpoints to the Hub during training
    hub_model_id="Percy3822/quiz_model",
)

# Assemble the Trainer and run fine-tuning.
# NOTE(review): `tokenizer=` was deprecated in favor of `processing_class=` in
# newer transformers releases — confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],  # Use train for eval in testing
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Runs the full training loop (writes checkpoints/logs per TrainingArguments).
trainer.train()

# Push trained model to Hub
# Uploads the final model + tokenizer to hub_model_id (requires HF auth).
trainer.push_to_hub()