File size: 3,018 Bytes
b6d0c9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# βœ… Clear GPU memory before training
# (this import + empty_cache() pair appeared twice in the original; once is enough —
# empty_cache() is a no-op when CUDA was never initialized)
import os

import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

torch.cuda.empty_cache()

# βœ… Load Dataset
# NOTE(review): downloads from the Hugging Face Hub — requires network access
# and valid credentials if the dataset is private.
dataset = load_dataset("zainabfatima097/My_Dataset")  # Change to your dataset path

# βœ… Check available splits
print(f"Available dataset splits: {dataset.keys()}")

# βœ… If dataset has only 'validation' split, rename it to 'train'
# This aliases rather than renames: after this, "train" and "validation" refer
# to the SAME split object, so later evaluation runs on the training data too.
if "train" not in dataset:
    dataset["train"] = dataset["validation"]

# βœ… Extract Text for Translation Task
# Language codes used to index each record's "translation" dict below.
source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
    """Split each translation record into parallel source/target text lists.

    Args:
        examples: Batch dict holding a "translation" column, where every entry
            is a dict keyed by language code (module-level ``source_lang`` /
            ``target_lang``).

    Returns:
        Dict with "input_text" (source-language strings) and "target_text"
        (target-language strings), aligned by index.
    """
    pairs = examples["translation"]
    return {
        "input_text": [pair[source_lang] for pair in pairs],
        "target_text": [pair[target_lang] for pair in pairs],
    }

# βœ… Apply Text Extraction
# batched=True hands preprocess_function a dict of column lists; the original
# "translation" column is kept alongside the two new text columns (it is only
# dropped later, in the tokenization map call).
dataset = dataset.map(preprocess_function, batched=True)

# βœ… Load Tokenizer
# NOTE(review): downloads the checkpoint from the Hugging Face Hub on first use.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"  # Use your model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# βœ… Tokenization
def tokenize_function(examples):
    """Tokenize source/target text pairs for seq2seq fine-tuning.

    Args:
        examples: Batch dict with "input_text" and "target_text" string lists.

    Returns:
        Model inputs (input_ids, attention_mask, ...) plus a "labels" key
        holding the tokenized targets with padding positions masked out.
    """
    model_inputs = tokenizer(
        examples["input_text"], truncation=True, padding="max_length", max_length=128
    )
    targets = tokenizer(
        examples["target_text"], truncation=True, padding="max_length", max_length=128
    )
    # Bug fix: with padding="max_length" the raw target input_ids are mostly
    # pad tokens; leaving them in `labels` trains the model to emit padding.
    # Replace pad-token ids with -100 so cross-entropy ignores those positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in targets["input_ids"]
    ]
    return model_inputs

# βœ… Apply Tokenization
# Drop the raw text columns so only model-ready fields (input_ids,
# attention_mask, labels) remain in the dataset.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["translation", "input_text", "target_text"])

# βœ… Set Train & Validation Splits
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets.get("validation", train_dataset)  # Use train if validation is missing
# NOTE(review): when "validation" is absent, evaluation runs on the training
# set, so eval metrics will overestimate generalization.

# βœ… Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# βœ… Training Arguments (Handles Memory Issues)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,   # Reduce batch size to prevent OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # Accumulate gradients to simulate larger batch
    # Bug fix: fp16 mixed precision is only valid on CUDA; hard-coding
    # fp16=True makes TrainingArguments raise on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",  # More efficient optimizer
    # NOTE(review): this kwarg was renamed to `eval_strategy` in
    # transformers >= 4.46 — keep `evaluation_strategy` only if the project
    # pins an older version; verify against the installed release.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # requires eval/save strategies to match (both "epoch")
    push_to_hub=False
)

# βœ… Initialize Trainer
# NOTE(review): a plain Trainer works here because every example was already
# padded to a fixed max_length; Seq2SeqTrainer + DataCollatorForSeq2Seq would
# be needed for dynamic padding or generation-based eval metrics — confirm
# before extending.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# βœ… Train Model (Handling GPU Memory Errors)
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    # Bug fix: setting CUDA_VISIBLE_DEVICES here has no effect — CUDA is
    # already initialized in this process — and the existing `trainer` still
    # carries GPU-oriented TrainingArguments (fp16, CUDA device), so retrying
    # with it would fail again. Rebuild the training setup explicitly for CPU.
    print("⚠️ CUDA Out of Memory! Switching to CPU...")
    torch.cuda.empty_cache()  # release cached blocks before the CPU retry
    model.to("cpu")
    cpu_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        fp16=False,    # mixed precision is unsupported on CPU
        no_cuda=True,  # force CPU execution for the retry
        optim="adamw_torch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    trainer = Trainer(
        model=model,
        args=cpu_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()

# βœ… Save Model
trainer.save_model("./final_model")
print("πŸŽ‰ Training complete! Model saved.")