"""Fine-tune a Helsinki-NLP MarianMT en->hi translation model on a Hub dataset."""

import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Clear any cached GPU memory before training starts.
# (The original did this twice back to back; once is enough.)
torch.cuda.empty_cache()
# Load the parallel corpus from the Hugging Face Hub (change to your dataset path).
dataset = load_dataset("zainabfatima097/My_Dataset")

# Show which splits the dataset actually ships with.
print(f"Available dataset splits: {dataset.keys()}")

# Some datasets only provide a 'validation' split; alias it as 'train'
# so the Trainer has a training split to work with.
if "train" not in dataset:
    dataset["train"] = dataset["validation"]
# Source/target language keys inside each `translation` record.
source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    """Flatten nested translation pairs into parallel text lists.

    Args:
        examples: batched mapping with a "translation" column, where each
            entry is a dict keyed by language code (e.g. {"en": ..., "hi": ...}).

    Returns:
        dict with "input_text" (source sentences) and "target_text"
        (target sentences), aligned by index.
    """
    inputs = [pair[source_lang] for pair in examples["translation"]]
    targets = [pair[target_lang] for pair in examples["translation"]]
    return {"input_text": inputs, "target_text": targets}
# Flatten the nested translation dicts into plain text columns.
dataset = dataset.map(preprocess_function, batched=True)

# Load the tokenizer matching the pretrained MarianMT checkpoint.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"  # Use your model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize_function(examples):
    """Tokenize source/target text into model inputs with loss labels.

    Relies on the module-level `tokenizer`. Targets are tokenized via
    `text_target=` so the tokenizer applies target-language handling
    (the original tokenized Hindi targets as if they were source text;
    `text_target` requires transformers >= 4.22 — TODO confirm version).

    Args:
        examples: batched mapping with "input_text" and "target_text" lists.

    Returns:
        Tokenizer output dict with an added "labels" key, where pad
        positions are set to -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Mask pad-token ids so cross-entropy does not train on padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs
# Tokenize the whole dataset and drop the raw text columns the model can't use.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["translation", "input_text", "target_text"],
)

train_dataset = tokenized_datasets["train"]
# Fall back to the training split when no validation split exists.
eval_dataset = tokenized_datasets.get("validation", train_dataset)
# Load the pretrained seq2seq model weights.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Training configuration tuned for limited GPU memory.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,   # small batch to prevent OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # simulates an effective batch size of 8
    fp16=torch.cuda.is_available(),  # fp16 is GPU-only; the original's fp16=True crashes on CPU
    optim="adamw_torch",
    evaluation_strategy="epoch",
    save_strategy="epoch",           # must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
)
# Wire the model, arguments, and datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
# Train, retrying on CPU if the GPU runs out of memory.
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    print("CUDA Out of Memory! Switching to CPU...")
    torch.cuda.empty_cache()  # release the failed allocation before retrying
    # NOTE(review): setting CUDA_VISIBLE_DEVICES after torch has initialised
    # CUDA does not hide the GPU from an existing process; we additionally
    # disable fp16 (unsupported on CPU) and move the model ourselves so the
    # retry has a chance of running on CPU — confirm Trainer honours this.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    trainer.args.fp16 = False
    model.to("cpu")
    trainer.train()

# Persist the fine-tuned weights for later use.
trainer.save_model("./final_model")
print("Training complete! Model saved.")