CoderHassan committed on
Commit
b267691
·
verified ·
1 Parent(s): cc8db4f

Create t5_urdu_translation

Browse files
Files changed (1) hide show
  1. t5_urdu_translation +53 -0
t5_urdu_translation ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune a T5 checkpoint for English -> Urdu translation.

Expects two CSV files in the working directory, ``train.csv`` and
``validation.csv``, each with an ``English`` column (source text) and an
``Urdu`` column (target text). The fine-tuned model and tokenizer are written
to ``./t5_urdu_translation``.
"""

import torch
from datasets import load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Load the parallel corpus from local CSV files. Point ``data_files`` at your
# own files (or swap in a Hugging Face dataset name) as needed; both splits
# must provide the "English" and "Urdu" columns used below.
dataset = load_dataset(
    'csv',
    data_files={'train': 'train.csv', 'validation': 'validation.csv'},
)

# Load T5 tokenizer and model.
# NOTE(review): the stock T5 SentencePiece vocabulary was built from
# English/German/French/Romanian text -- verify that Urdu script does not
# collapse to <unk> tokens; a multilingual checkpoint may be a better fit.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize one batch of English/Urdu pairs for seq2seq training.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; a mapping
            with "English" and "Urdu" keys, each holding a list of strings.

    Returns:
        The tokenizer output for the task-prefixed English inputs, with the
        tokenized Urdu targets stored under "labels".
    """
    # T5 is a text-to-text model: the task is selected by a textual prefix.
    inputs = ["translate English to Urdu: " + ex for ex in examples["English"]]
    targets = examples["Urdu"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=512, truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs


# Tokenize datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Sequences are left unpadded by preprocess_function, so pad per batch here.
# The seq2seq collator also pads "labels" (with -100 so padding is ignored by
# the loss); the default padding collator does not handle labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define training arguments. ``predict_with_generate`` only exists on
# Seq2SeqTrainingArguments; plain TrainingArguments rejects it with TypeError.
# NOTE(review): newer transformers releases rename ``evaluation_strategy`` to
# ``eval_strategy`` -- confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_urdu_translation",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
)

# Define Trainer. Seq2SeqTrainer is the variant that honours
# ``predict_with_generate`` during evaluation.
# NOTE(review): newer transformers releases replace the ``tokenizer`` keyword
# with ``processing_class`` -- confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded
# with ``from_pretrained``.
trainer.save_model("./t5_urdu_translation")
tokenizer.save_pretrained("./t5_urdu_translation")