# Import the necessary libraries
import datasets
import numpy as np
import os
import pandas as pd
import random
import sentencepiece  # required at runtime by the mBART tokenizer
import sacrebleu
import sacremoses
import tqdm
import transformers
import torch
import wandb
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from typing import List
class TranslationDataset(Dataset):
    def __init__(self, source_sentences: List[str], target_sentences: List[str], tokenizer, max_length=32):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source_sentence = self.source_sentences[idx]
        target_sentence = self.target_sentences[idx]
        tokenized_source = self.tokenizer(source_sentence, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        tokenized_target = self.tokenizer(target_sentence, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        return tokenized_source, tokenized_target
# Next, let's define a function to load the sentences from the data files
def load_sentences(file_path):
with open(file_path, "r") as f:
sentences = f.read().split("\n")
# Remove any empty sentences
sentences = [sentence for sentence in sentences if sentence]
return sentences
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Sunbird/sunbird-mul-en-mbart-merged")
# Load sentences
source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/train.lug")
target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/train.en")
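# Sanity check: the .lug and .en files are expected to be line-aligned,
# so the two lists must have the same length.
assert len(source_sentences) == len(target_sentences), "train files are not parallel"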
# Create the dataset
dataset = TranslationDataset(source_sentences, target_sentences, tokenizer)
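# A quick, illustrative look at one item: because __getitem__ uses
# return_tensors="pt", each field carries a leading batch dim of 1,
# i.e. input_ids has shape (1, 32) here.
example_source, example_target = dataset[0]
print(example_source["input_ids"].shape, example_target["input_ids"].shape)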
# Similarly, you can create validation and test datasets
valid_source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/val.lug")
valid_target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/val.en")
valid_dataset = TranslationDataset(valid_source_sentences, valid_target_sentences, tokenizer)
from transformers import AutoModelForSeq2SeqLM
# Initialize the model
model = AutoModelForSeq2SeqLM.from_pretrained("Sunbird/sunbird-mul-en-mbart-merged")
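# Optional sanity check: report the model size before fine-tuning.
num_params = sum(p.numel() for p in model.parameters())
print(f"Loaded model with {num_params / 1e6:.1f}M parameters")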
from torch.utils.data import DataLoader
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
import torch.nn.functional as F
# Create the DataLoader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  # no need to shuffle validation data
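# Because each dataset item carries that extra singleton dim, a batch of 16
# stacks to (16, 1, 32); the training loop below squeezes dim 1 back out.
sample_batch = next(iter(dataloader))
print(sample_batch[0]["input_ids"].shape)  # expected: torch.Size([16, 1, 32])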
# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=1e-6)
# Move the model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
def train_model(model, dataloader, val_dataloader, optimizer, num_epochs=5000, save_path="Total Combined Data V2 Aug 16 2023/Models/mul_en_base_v2.bin", early_stop=10):
    # Set initial validation loss to positive infinity
    best_val_loss = float("inf")
    # Initialize early stopping counter
    early_stop_counter = 0
    # Make sure the checkpoint directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # Training loop
    for epoch in range(num_epochs):
        # Training
        model.train()
        for batch in dataloader:
            optimizer.zero_grad()
            # squeeze(1) drops the singleton dim added by return_tensors="pt";
            # unlike squeeze(), it is safe when a batch has a single example
            input_ids = batch[0]["input_ids"].squeeze(1).to(device)
            attention_mask = batch[0]["attention_mask"].squeeze(1).to(device)
            labels = batch[1]["input_ids"].squeeze(1).to(device)
            # Ignore pad positions in the loss (Hugging Face treats -100 as "ignore")
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch[0]["input_ids"].squeeze(1).to(device)
                attention_mask = batch[0]["attention_mask"].squeeze(1).to(device)
                labels = batch[1]["input_ids"].squeeze(1).to(device)
                labels[labels == tokenizer.pad_token_id] = -100  # mask padding here too
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Validation loss at epoch {epoch}: {avg_val_loss:.4f}")
        # If validation loss improved, save the model and reset early stopping counter
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            early_stop_counter = 0
        # If validation loss did not improve, increment early stopping counter
        else:
            early_stop_counter += 1
        # If early stopping counter reached limit, stop training early
        if early_stop_counter >= early_stop:
            print("Early stopping triggered")
            break
# Train the model
print("Training Begins Here!")
train_model(model, dataloader, val_dataloader, optimizer)
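# train_model keeps the checkpoint with the best validation loss on disk, so
# reload it before scoring; otherwise we would score the weights from the
# last (possibly non-best) epoch. The path matches train_model's default save_path.
model.load_state_dict(torch.load("Total Combined Data V2 Aug 16 2023/Models/mul_en_base_v2.bin", map_location=device))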
#bleu scores
print("Scoring Has Begun!")
from sacrebleu import corpus_bleu
# Create the test dataset and DataLoader
test_source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/test.lug")
test_target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/test.en")
test_dataset = TranslationDataset(test_source_sentences, test_target_sentences, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16)
# Evaluate the model
model.eval()
model.to(device)
predictions = []
actuals = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0]["input_ids"].squeeze(1).to(device)
        attention_mask = batch[0]["attention_mask"].squeeze(1).to(device)
        labels = batch[1]["input_ids"].squeeze(1).to(device)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=32)  # match the dataset's max_length
        # Convert output tokens to sentences, dropping pad/eos/language-code tokens
        pred_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        actual_sentences = tokenizer.batch_decode(labels, skip_special_tokens=True)
        predictions.extend(pred_sentences)
        actuals.extend(actual_sentences)
# Compute BLEU score
bleu_score = corpus_bleu(predictions, [actuals]).score
print(f"BLEU score: {bleu_score}")