# agent_1_train.py
# Train a tiny GPT-2-style model from scratch on WikiText-2.

from datasets import load_dataset
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# WikiText-2 contains many empty lines; drop them so no batch consists
# entirely of padding (a batch whose labels are all masked can yield a
# NaN loss).
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)

# Tokenizer: GPT-2 has no pad token, so reuse the EOS token for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tiny GPT-2 config (~16M params, most of them in the token embeddings)
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128,  # maximum context length
    n_embd=256,
    n_layer=4,
    n_head=4,
)
model = GPT2LMHeadModel(config)

# Tokenize the dataset into fixed-length 128-token sequences
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Keep the attention mask so the model ignores padding positions
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator for causal LM: copies input_ids into labels and masks pad tokens
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./tiny-gpt",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    learning_rate=5e-4,
    fp16=False,  # set True on a CUDA GPU for faster training
)

# Trainer (on newer transformers releases the tokenizer keyword is deprecated
# in favor of processing_class=tokenizer, but it still works)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save the final model and tokenizer
model.save_pretrained("./tiny-gpt")
tokenizer.save_pretrained("./tiny-gpt")

print("Training complete! Model saved in ./tiny-gpt")
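
# --- Optional sanity check (a minimal sketch, not part of the original
# script): reload the saved checkpoint and generate a short continuation.
# A 4-layer model trained for 3 epochs will produce rough but vaguely
# English-looking text; the prompt below is an arbitrary example.
from transformers import pipeline

generator = pipeline("text-generation", model="./tiny-gpt", tokenizer="./tiny-gpt")
sample = generator("The history of the", max_new_tokens=30, do_sample=True)
print(sample[0]["generated_text"])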