import torch

# ✅ Clear GPU memory before training
torch.cuda.empty_cache()

# ✅ Load necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# ✅ Load Dataset
dataset = load_dataset("zainabfatima097/My_Dataset")  # Change to your dataset path

# ✅ Check available splits
print(f"Available dataset splits: {dataset.keys()}")

# ✅ If the dataset has only a 'validation' split, use it as 'train'
if "train" not in dataset:
    dataset["train"] = dataset["validation"]

# ✅ Extract Text for Translation Task
source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
    """Extract input and target texts for translation."""
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    return {"input_text": inputs, "target_text": targets}

# ✅ Apply Text Extraction
dataset = dataset.map(preprocess_function, batched=True)

# ✅ Load Tokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"  # Use your model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# ✅ Tokenization
def tokenize_function(examples):
    inputs = tokenizer(examples["input_text"], truncation=True, padding="max_length", max_length=128)
    # Tokenize targets with text_target so the target-language vocabulary is used
    targets = tokenizer(text_target=examples["target_text"], truncation=True, padding="max_length", max_length=128)
    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in targets["input_ids"]
    ]
    return inputs

# ✅ Apply Tokenization
tokenized_datasets = dataset.map(
    tokenize_function, batched=True,
    remove_columns=["translation", "input_text", "target_text"]
)

# ✅ Set Train & Validation Splits
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets.get("validation", train_dataset)  # Fall back to train if validation is missing

# ✅ Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# ✅ Training Arguments (Handles Memory Issues)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,   # Reduce batch size to prevent OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # Accumulate gradients to simulate a larger batch
    fp16=True,                       # Mixed precision to reduce memory
    optim="adamw_torch",             # Memory-efficient optimizer
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# ✅ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# ✅ Train Model (Handling GPU Memory Errors)
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA Out of Memory! Switching to CPU...")
    # Setting CUDA_VISIBLE_DEVICES here is too late (CUDA is already initialized),
    # so build fresh CPU-only arguments and a new Trainer instead.
    torch.cuda.empty_cache()
    cpu_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        no_cuda=True,                # Train on CPU; fp16 is not supported there
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    model.to("cpu")
    trainer = Trainer(
        model=model,
        args=cpu_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()

# ✅ Save Model
trainer.save_model("./final_model")
print("🎉 Training complete! Model saved.")
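
# ✅ Quick Inference Check (optional)
# A minimal sketch to verify the saved checkpoint: reload it and translate one
# sentence. This assumes trainer.save_model() also wrote the tokenizer to
# ./final_model (it does when a tokenizer is passed to the Trainer); the sample
# sentence below is made up, purely a smoke test.
finetuned_tokenizer = AutoTokenizer.from_pretrained("./final_model")
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./final_model")

sample = "How are you today?"  # hypothetical English test sentence
inputs = finetuned_tokenizer(sample, return_tensors="pt")
outputs = finetuned_model.generate(**inputs, max_length=128)
print(finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True))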