Hugging Face Space (status: Sleeping) — tokenization script excerpt:
# Stdlib imports.
import os

# Third-party imports.
from datasets import load_from_disk
from transformers import MT5Tokenizer

# Local imports: if the dataset was not saved to disk yet, importing
# prepare_data builds it as a side effect and exposes `dataset`.
from prepare_data import dataset

# Pretrained multilingual T5 tokenizer shared by the tokenize step below.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
def tokenize(batch):
    """Tokenize a batch of input/target text pairs for mT5 seq2seq training.

    Args:
        batch: A mapping with "input_text" and "target_text" lists of strings
            (the column format `datasets.Dataset.map(batched=True)` provides).

    Returns:
        The tokenized inputs dict (input_ids, attention_mask, ...) with an
        added "labels" key holding the tokenized targets.
    """
    x = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=32)
    y = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=32)
    # BUG FIX: pad positions in the labels must be set to -100 (the ignore
    # index of PyTorch cross-entropy, which transformers models use), so the
    # loss is not computed over padding. Leaving raw pad token ids in the
    # labels trains the model to emit padding.
    pad_id = tokenizer.pad_token_id
    x["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in y["input_ids"]
    ]
    return x
def _tokenize_split(split_name):
    # Map the tokenize function over one split, dropping the raw text columns
    # so only the model-ready fields remain.
    split = dataset[split_name]
    return split.map(tokenize, batched=True, remove_columns=split.column_names)


# Tokenize both splits and persist them for the training script.
train_tok = _tokenize_split("train")
val_tok = _tokenize_split("validation")

train_tok.save_to_disk("processed/train")
val_tok.save_to_disk("processed/val")
print("✅ Tokenized data saved")