from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, concatenate_datasets

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 ships without a padding token; reuse EOS so padding="max_length" works below.
tokenizer.pad_token = tokenizer.eos_token

# Load the datasets (the processed datasets from 'datasets.py')
conversation_dataset = load_dataset("bavard/personachat_truecased")
coding_dataset = load_dataset("lvwerra/stack-exchange-paired")
math_dataset = load_dataset("allenai/math_qa")

# None of these corpora has a literal "text" column, so flatten each example
# into one first. The column choices below are assumptions; adjust them to
# match the preprocessing in 'datasets.py'.
conversation_dataset = conversation_dataset.map(
    lambda ex: {"text": " ".join(ex["history"])}
)
coding_dataset = coding_dataset.map(
    lambda ex: {"text": ex["question"] + "\n" + ex["response_j"]}
)
math_dataset = math_dataset.map(
    lambda ex: {"text": ex["Problem"] + "\n" + ex["Rationale"]}
)

# Tokenize datasets (you can directly apply 'tokenize_function' from 'datasets.py' here)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# remove_columns drops the raw columns so all three tokenized datasets end up
# with identical features, which concatenate_datasets requires.
conversation_dataset = conversation_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=conversation_dataset["train"].column_names,
)
coding_dataset = coding_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=coding_dataset["train"].column_names,
)
math_dataset = math_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=math_dataset["train"].column_names,
)

# Combine the datasets into one (optional). Dataset objects cannot be joined
# with "+"; use concatenate_datasets instead.
train_dataset = concatenate_datasets(
    [conversation_dataset["train"], coding_dataset["train"], math_dataset["train"]]
)

# Causal-LM collator: copies input_ids into labels so the Trainer can compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output",          # Directory to save the trained model
    num_train_epochs=3,             # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device during training
    per_device_eval_batch_size=4,   # Batch size per device during evaluation
    logging_dir="./logs",           # Directory for logs
    logging_steps=10,               # Log every 10 steps
    save_steps=500,                 # Save a checkpoint every 500 steps
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,                      # Combined training dataset
    eval_dataset=conversation_dataset["validation"],  # PersonaChat has a "validation" split, not "test"
    data_collator=data_collator,
)

# Start training
trainer.train()
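
# --- Usage sketch (illustrative, not part of the original script) ---
# A minimal example of generating text with the fine-tuned model. It assumes
# the final weights were saved to "./output", e.g. via
# trainer.save_model("./output") after training; the prompt below is made up.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./output")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

prompt = "Q: How do I reverse a list in Python?\nA:"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=64,                    # cap the length of the reply
    do_sample=True,                       # sample rather than greedy-decode
    top_p=0.9,                            # nucleus sampling
    pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))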