from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import shutil
# Load the JSONL training data (load_dataset creates a "train" split by default)
dataset = load_dataset("json", data_files="python.jsonl")
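# Note: each record in python.jsonl is assumed to contain "prompt" and "completion"
# string fields (the names match the tokenize() function below), e.g.
#   {"prompt": "def add(a, b):", "completion": "\n    return a + b"}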
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# distilgpt2 has no pad token, so reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
# Tokenization: concatenate prompt + completion and tokenize to a fixed length
def tokenize(example):
    full_text = example["prompt"] + example["completion"]
    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
    # For causal language modeling, the labels are the input ids themselves
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized = dataset["train"].map(tokenize)
# Training configuration: small batch, one epoch, no intermediate checkpoints
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)
trainer.train()
# Save the fine-tuned model and tokenizer
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")
# Zip the model
shutil.make_archive("trained_model", 'zip', "trained_model")
print("✅ Training complete. Model zipped to 'trained_model.zip'") |