from transformers import MT5Tokenizer
from datasets import load_from_disk
import os
# If you didn't save dataset yet, just import from prepare_data.py
from prepare_data import dataset
# Load the pretrained mT5 (multilingual T5) tokenizer used for both inputs and targets.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
def tokenize(batch):
    """Tokenize a batch of input/target text pairs for seq2seq fine-tuning.

    Args:
        batch: dict with "input_text" and "target_text" lists (datasets batched map).

    Returns:
        The tokenized inputs dict with a "labels" key holding target token ids,
        where pad positions are replaced by -100 so the loss ignores them.
    """
    x = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=32)
    y = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=32)
    # Bug fix: with padding="max_length" the raw target ids contain pad tokens;
    # left as-is, cross-entropy would train the model to emit pads. HF models
    # ignore label positions equal to -100, so mask pads out here.
    x["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in y["input_ids"]
    ]
    return x
# Apply the tokenizer to each split, dropping the raw text columns so only
# model-ready fields (input_ids, attention_mask, labels) remain.
train_split = dataset["train"]
val_split = dataset["validation"]

train_tok = train_split.map(
    tokenize,
    batched=True,
    remove_columns=train_split.column_names,
)
val_tok = val_split.map(
    tokenize,
    batched=True,
    remove_columns=val_split.column_names,
)

# Persist the processed splits for the training script to reload.
train_tok.save_to_disk("processed/train")
val_tok.save_to_disk("processed/val")
print("✅ Tokenized data saved")