"""Tokenize the prepared dataset for mT5 fine-tuning and save it to disk."""
from transformers import MT5Tokenizer
from datasets import load_from_disk
import os

# If you didn't save dataset yet, just import from prepare_data.py
from prepare_data import dataset

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

# Both inputs and targets are padded/truncated to this length.
MAX_LENGTH = 32


def tokenize(batch):
    """Tokenize one batch of (input_text, target_text) pairs.

    Returns the encoded inputs with a ``labels`` field holding the target
    token ids. Padding positions in ``labels`` are replaced with -100 so
    that the cross-entropy loss ignores them during training (HF models
    treat -100 as the ignore index).
    """
    x = tokenizer(batch["input_text"], padding="max_length",
                  truncation=True, max_length=MAX_LENGTH)
    y = tokenizer(batch["target_text"], padding="max_length",
                  truncation=True, max_length=MAX_LENGTH)
    # Mask pad tokens in the labels — otherwise loss is computed over padding.
    pad_id = tokenizer.pad_token_id
    x["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in y["input_ids"]
    ]
    return x


# Drop the raw text columns after tokenization; only model inputs remain.
train_tok = dataset["train"].map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)
val_tok = dataset["validation"].map(
    tokenize, batched=True, remove_columns=dataset["validation"].column_names
)

train_tok.save_to_disk("processed/train")
val_tok.save_to_disk("processed/val")
print("✅ Tokenized data saved")