# SAWiT.AI / fine_tune.py
# Uploaded by zainabfatima097 (commit b6d0c9b, verified)
import os

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# βœ… Clear any cached GPU memory before training.
# (The original repeated `import torch` / `empty_cache()` twice; once is enough.
# Guarded so the script also starts cleanly on CPU-only machines.)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# βœ… Load Dataset from the HuggingFace Hub.
dataset = load_dataset("zainabfatima097/My_Dataset") # Change to your dataset path

# βœ… Check available splits
print(f"Available dataset splits: {dataset.keys()}")

# βœ… If dataset has no 'train' split, reuse 'validation' as training data.
# NOTE(review): this raises KeyError when 'validation' is also absent —
# confirm the hub dataset always ships a validation split.
if "train" not in dataset:
    dataset["train"] = dataset["validation"]
# βœ… Translation direction for this run.
source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
    """Flatten the nested 'translation' column into parallel text lists.

    Each entry of ``examples["translation"]`` is a dict keyed by language
    code; this pulls out the source/target strings so later tokenization
    can work on flat columns.
    """
    src_texts = []
    tgt_texts = []
    for pair in examples["translation"]:
        src_texts.append(pair[source_lang])
        tgt_texts.append(pair[target_lang])
    return {"input_text": src_texts, "target_text": tgt_texts}
# βœ… Apply text extraction in batches; adds 'input_text'/'target_text' columns.
dataset = dataset.map(preprocess_function, batched=True)

# βœ… Load the tokenizer matching the pretrained en->hi translation model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi" # Use your model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# βœ… Tokenization
def tokenize_function(examples):
    """Tokenize source/target text into fixed-length model inputs.

    Pads/truncates both sides to 128 tokens and attaches target token ids
    as ``labels``. Pad positions in the labels are replaced with -100 so
    the cross-entropy loss ignores them — with plain pad ids, the model
    would be trained to emit padding (original bug).
    """
    inputs = tokenizer(examples["input_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in targets["input_ids"]
    ]
    return inputs
# βœ… Apply tokenization and drop raw text columns so only model inputs remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["translation", "input_text", "target_text"])

# βœ… Set train & validation splits. Falling back to the train split for eval
# means eval metrics will be optimistic when no validation split exists.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets.get("validation", train_dataset) # Use train if validation is missing

# βœ… Load the pretrained seq2seq model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# βœ… Training arguments, tuned to fit in limited GPU memory.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,   # Reduce batch size to prevent OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # effective train batch = 2 * 4 = 8
    fp16=torch.cuda.is_available(),  # fp16 is CUDA-only; hard-coded True crashed on CPU-only machines
    optim="adamw_torch",             # More efficient optimizer
    evaluation_strategy="epoch",     # eval + save each epoch so the best checkpoint can be restored
    save_strategy="epoch",
    load_best_model_at_end=True,     # reload checkpoint with the lowest eval loss after training
    push_to_hub=False
)
# βœ… Initialize Trainer.
# NOTE(review): for seq2seq fine-tuning, Seq2SeqTrainer + DataCollatorForSeq2Seq
# is the usual choice; plain Trainer works here only because every example is
# already padded to a fixed max_length.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,  # also saved alongside the model by save_model
)
# βœ… Train the model, falling back to CPU if the GPU runs out of memory.
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA Out of Memory! Switching to CPU...")
    # Release cached blocks held by the failed GPU run before retrying.
    torch.cuda.empty_cache()
    # NOTE: CUDA_VISIBLE_DEVICES has no effect once CUDA is initialized in this
    # process (original bug) — kept only for any child processes; the model is
    # moved to CPU explicitly instead.
    os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU
    # fp16 autocast is unsupported on CPU; the original retry would fail with it on.
    # NOTE(review): Trainer may have already configured mixed precision at init —
    # confirm the CPU retry actually runs in fp32.
    trainer.args.fp16 = False
    model.to("cpu")
    trainer.train()

# βœ… Save the fine-tuned model (and tokenizer) for later use.
trainer.save_model("./final_model")
print("πŸŽ‰ Training complete! Model saved.")