# Source: Hugging Face repository "Language.converters", file "t5_urdu_translation"
# (created by CoderHassan, commit b267691, 1.68 kB).
import torch
from datasets import load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
# Load the train/validation splits from local CSV files (swap in your own
# paths as needed). preprocess_function reads the "English" and "Urdu"
# columns from each split.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "validation.csv",
    },
)
# Preprocess dataset
def preprocess_function(examples):
    """Tokenize one batch of English/Urdu sentence pairs.

    Each English sentence is prefixed with the task prompt
    "translate English to Urdu: " and tokenized as the model input; the
    matching Urdu sentences are tokenized and attached as ``labels``.
    Both sides are truncated to 512 tokens and left unpadded.

    Args:
        examples: A batch mapping with "English" and "Urdu" keys, each
            holding a list of strings.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        containing the token ids of the Urdu targets.
    """
    task_prefix = "translate English to Urdu: "
    prompts = [task_prefix + sentence for sentence in examples["English"]]
    references = examples["Urdu"]

    # Relies on the module-level `tokenizer`, which is created before
    # dataset.map() invokes this function.
    encoded = tokenizer(prompts, max_length=512, truncation=True)
    encoded_references = tokenizer(references, max_length=512, truncation=True)

    encoded["labels"] = encoded_references.input_ids
    return encoded
# Load the pretrained checkpoint: tokenizer first, then the seq2seq model.
# NOTE(review): the stock T5 vocabulary may not cover Urdu script — confirm
# that Urdu targets are not tokenized to <unk>; a multilingual checkpoint
# may be needed.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Run preprocess_function over every split, one batch of rows at a time.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
)
# Single source of truth for where checkpoints, the final model and the
# tokenizer are written.
OUTPUT_DIR = "./t5_urdu_translation"

# Define training arguments.
# `predict_with_generate` is a field of Seq2SeqTrainingArguments only; passing
# it to the plain TrainingArguments raises a TypeError before training starts.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
)

# preprocess_function leaves inputs and labels unpadded, so each batch must be
# padded at collation time. DataCollatorForSeq2Seq pads the inputs with the
# tokenizer's pad token and the labels with -100 so padded positions are
# ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define Trainer.
# Seq2SeqTrainer is the trainer that honours `predict_with_generate` during
# evaluation; the base Trainer does not use it.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained().
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)