# multilingual-transliteration / tokenize_save.py
from transformers import MT5Tokenizer
from datasets import load_from_disk  # also usable if the raw dataset was already saved to disk

# The raw dataset comes from prepare_data.py; if it has not been saved to disk yet,
# importing it directly is the simplest option.
from prepare_data import dataset

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
def tokenize(batch):
    # Tokenize source and target strings to fixed-length (32-token) sequences.
    x = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=32)
    y = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=32)
    # Use the target token ids as labels for sequence-to-sequence training.
    x["labels"] = y["input_ids"]
    return x
# Tokenize both splits, dropping the original text columns.
train_tok = dataset["train"].map(tokenize, batched=True, remove_columns=dataset["train"].column_names)
val_tok = dataset["validation"].map(tokenize, batched=True, remove_columns=dataset["validation"].column_names)

train_tok.save_to_disk("processed/train")
val_tok.save_to_disk("processed/val")
print("✅ Tokenized data saved")