"""Fine-tune t5-small on the MVPQuestion dataset and push the result to the Hub.

The script loads the "train" split of ``DetectiveShadow/MVPQuestion``,
tokenizes its ``input``/``output`` columns, trains a
``T5ForConditionalGeneration`` model with the Hugging Face ``Trainer``, and
uploads both the model and the tokenizer to ``DetectiveShadow/MVPTrivia``.
"""

from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Both the source and the target sequences are truncated/padded to this length.
MAX_LENGTH = 64

# Label value skipped by the loss (the PyTorch cross-entropy default
# ``ignore_index``); used to mask padding positions in the targets.
IGNORE_INDEX = -100

# Step 1: Load dataset
dataset = load_dataset("DetectiveShadow/MVPQuestion")["train"]

# Optional: if the dataset uses different column names, rename them to the
# "input"/"output" names that tokenize() expects, e.g. via
# dataset.rename_columns({"your_input_column": "input", "your_output_column": "output"}).

# Step 2: Load tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Step 3: Tokenization function
def tokenize(example):
    """Tokenize one dataset row into model inputs and loss-ready labels.

    Args:
        example: A single row with string fields ``"input"`` and ``"output"``.

    Returns:
        The tokenizer encoding of ``example["input"]`` with an extra
        ``"labels"`` entry holding the token ids of ``example["output"]``,
        where every padding position is replaced by ``IGNORE_INDEX``.
    """
    input_enc = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    target_enc = tokenizer(
        example["output"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on the pad tokens and the model would be trained to
    # emit padding.
    pad_id = tokenizer.pad_token_id
    input_enc["labels"] = [
        IGNORE_INDEX if token_id == pad_id else token_id
        for token_id in target_enc["input_ids"]
    ]
    return input_enc


tokenized = dataset.map(tokenize)

# Step 4: Training configuration
training_args = TrainingArguments(
    output_dir="./MVPTrivia",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id="DetectiveShadow/MVPTrivia",  # This is where your model will go
    hub_strategy="every_save",
)

# Step 5: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
)

# Step 6: Train and push
trainer.train()
trainer.push_to_hub()
# The Trainer was not given the tokenizer, so upload it explicitly to the
# same repository as the model.
tokenizer.push_to_hub("DetectiveShadow/MVPTrivia")