"""Fine-tune distilgpt2 on prompt/completion pairs from python.jsonl.

Loads the JSONL dataset, tokenizes prompt+completion as a single causal-LM
sequence, trains for one epoch, then saves and zips the resulting model.
"""
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import os  # NOTE(review): unused here — kept in case other tooling relies on it
import shutil

# Load dataset (expects one JSON object per line with "prompt" and "completion" keys)
dataset = load_dataset("json", data_files="python.jsonl")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# GPT-2 has no pad token; reuse EOS for padding (standard practice).
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


def tokenize(example):
    """Tokenize one prompt+completion pair into fixed-length causal-LM features.

    Returns input_ids, attention_mask, and labels where padding positions are
    set to -100 so the loss ignores them (otherwise, with pad == eos, the model
    would be trained to predict long EOS runs).
    """
    # Append EOS so the model learns where a completion ends.
    full_text = example["prompt"] + example["completion"] + tokenizer.eos_token
    tokens = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Mask padded positions: -100 is the ignore index for the LM loss.
    tokens["labels"] = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens


# Drop the raw string columns so the tokenized dataset contains only tensors.
tokenized = dataset["train"].map(
    tokenize,
    remove_columns=dataset["train"].column_names,
)

# Training
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",  # final model is saved explicitly below
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)

trainer.train()

# Save model and tokenizer together so the directory is directly loadable.
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")

# Zip the model directory for distribution.
shutil.make_archive("trained_model", 'zip', "trained_model")

print("✅ Training complete. Model zipped to 'trained_model.zip'")