import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Free any GPU memory cached by earlier runs before training starts.
torch.cuda.empty_cache()

# Load the parallel corpus from the Hugging Face Hub.
dataset = load_dataset("zainabfatima097/My_Dataset")

print(f"Available dataset splits: {dataset.keys()}")

# Some datasets ship without a train split; fall back to the validation
# split so there is still data to train on.
if "train" not in dataset:
    dataset["train"] = dataset["validation"]

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    """Extract source and target texts from the translation pairs."""
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    return {"input_text": inputs, "target_text": targets}


# Flatten the nested translation dicts into plain text columns.
dataset = dataset.map(preprocess_function, batched=True)
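
# Quick check (optional): show one extracted pair. Assumes the dataset uses
# the standard {"translation": {"en": ..., "hi": ...}} layout relied on above.
sample = dataset["train"][0]
print(sample["input_text"], "->", sample["target_text"])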

# Start from a pretrained English-to-Hindi Marian checkpoint and fine-tune it.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], truncation=True, padding="max_length", max_length=128)
    # Tokenize targets via text_target so the target-side vocabulary is used
    # (Marian models have separate source and target sentencepiece models).
    labels = tokenizer(text_target=examples["target_text"], truncation=True, padding="max_length", max_length=128)
    # Mask padding positions with -100 so they are ignored by the loss.
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["translation", "input_text", "target_text"])
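
# Optional sanity check: decode the labels of one tokenized example,
# filtering out the -100 padding markers inserted above.
example = tokenized_datasets["train"][0]
print(tokenizer.decode([t for t in example["labels"] if t != -100], skip_special_tokens=True))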

train_dataset = tokenized_datasets["train"]
# Fall back to the training data for evaluation if no validation split exists.
eval_dataset = tokenized_datasets.get("validation", train_dataset)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
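
# Optional: report the size of the pretrained model before fine-tuning.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")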

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 2 * 4 = 8
    fp16=True,                      # mixed precision; requires a CUDA GPU
    optim="adamw_torch",
    evaluation_strategy="epoch",    # renamed eval_strategy in newer transformers releases
    save_strategy="epoch",          # must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
)

# A plain Trainer suffices here: seq2seq models compute their loss from the
# "labels" column added during tokenization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA Out of Memory! Switching to CPU...")
    torch.cuda.empty_cache()
    # Setting CUDA_VISIBLE_DEVICES here is too late: CUDA is already
    # initialised and the Trainer has cached its device. Rebuild the
    # arguments and the trainer for CPU training instead.
    cpu_args = TrainingArguments(
        output_dir="./results", per_device_train_batch_size=2,
        per_device_eval_batch_size=2, gradient_accumulation_steps=4,
        no_cuda=True, optim="adamw_torch",  # no fp16: mixed precision is GPU-only
        evaluation_strategy="epoch", save_strategy="epoch",
        load_best_model_at_end=True, push_to_hub=False,
    )
    trainer = Trainer(model=model.to("cpu"), args=cpu_args, train_dataset=train_dataset,
                      eval_dataset=eval_dataset, tokenizer=tokenizer)
    trainer.train()

trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")  # keep the tokenizer next to the weights
print("🎉 Training complete! Model saved.")
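
# A minimal inference sketch (optional): reload the fine-tuned weights and
# translate one sentence. Assumes the tokenizer was saved to "./final_model"
# alongside the model, as done above.
trained_model = AutoModelForSeq2SeqLM.from_pretrained("./final_model")
batch = tokenizer("How are you?", return_tensors="pt")
outputs = trained_model.generate(**batch, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))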