File size: 837 Bytes
d14e502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from transformers import MT5Tokenizer
from datasets import load_from_disk
import os

# If you didn't save dataset yet, just import from prepare_data.py
from prepare_data import dataset

# SentencePiece tokenizer matching the mT5-small checkpoint this data is prepared for.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

def tokenize(batch):
    """Tokenize a batch of input/target text pairs for seq2seq fine-tuning.

    Expects ``batch`` to be a dict with ``"input_text"`` and ``"target_text"``
    lists (as produced by ``datasets`` batched mapping). Returns the tokenized
    inputs with a ``labels`` field added.

    Both sides are padded/truncated to a fixed length of 32 tokens.
    """
    x = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=32)
    y = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=32)
    # Replace pad token ids in the labels with -100 so the model's
    # cross-entropy loss (ignore_index=-100) skips padding positions.
    # Leaving the raw pad ids in would make the model learn to emit padding.
    pad_id = tokenizer.pad_token_id
    x["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in y["input_ids"]
    ]
    return x

def _tokenize_split(split_name):
    # Tokenize one split, dropping the raw text columns so only model inputs remain.
    split = dataset[split_name]
    return split.map(tokenize, batched=True, remove_columns=split.column_names)

train_tok = _tokenize_split("train")
val_tok = _tokenize_split("validation")



# Persist both tokenized splits for the training script to load later.
for out_path, split_ds in (("processed/train", train_tok), ("processed/val", val_tok)):
    split_ds.save_to_disk(out_path)

print("✅ Tokenized data saved")