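The script below fine-tunes distilgpt2 on a local python.jsonl file with the Hugging Face Trainer and zips the result. Its tokenize() step expects each JSON line to carry a "prompt" and a "completion" field. As a minimal sketch of that format (the sample records here are made up for illustration, not part of the original data):

import json

# Illustrative only: two made-up prompt/completion records in the shape
# that tokenize() below expects; replace with real training data.
records = [
    {"prompt": "# Add two numbers\n", "completion": "def add(a, b):\n    return a + b\n"},
    {"prompt": "# Reverse a string\n", "completion": "def reverse(s):\n    return s[::-1]\n"},
]

with open("python.jsonl", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

The training script itself: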
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import shutil

# Load dataset
dataset = load_dataset("json", data_files="python.jsonl")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
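# GPT-2-family models ship without a pad token, so reuse EOS for padding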
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    full_text = example["prompt"] + example["completion"]
    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
    # Use the input ids as labels, but mask padded positions with -100
    # so no loss is computed on pad tokens
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

tokenized = dataset["train"].map(tokenize)

# Training
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)

trainer.train()

# Save model
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")

# Zip the model
shutil.make_archive("trained_model", 'zip', "trained_model")
print("✅ Training complete. Model zipped to 'trained_model.zip'")
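Once training finishes, the saved "trained_model" directory can be reloaded for a quick smoke test. A minimal sketch (the prompt string is just an example):

from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the fine-tuned model and tokenizer saved by the script above
tokenizer = AutoTokenizer.from_pretrained("trained_model")
model = AutoModelForCausalLM.from_pretrained("trained_model")

prompt = "# Add two numbers\n"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))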