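# Fine-tunes the Sunbird "mul-en" mBART checkpoint on a Luganda-to-English
# parallel corpus (train/val/test .lug/.en files) and reports a corpus-level
# BLEU score on the held-out test set.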
import datasets
import numpy as np
import os
import pandas as pd
import random
import sentencepiece
import sacrebleu
import sacremoses
import tqdm
import transformers
import torch
import wandb

from transformers import AutoTokenizer
from torch.utils.data import Dataset
from typing import List

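# A paired dataset that tokenizes one source and one target sentence per item.
# Both sides are padded/truncated to max_length (32 by default), so any
# sentence longer than 32 tokens is clipped.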
class TranslationDataset(Dataset):
    def __init__(self, source_sentences: List[str], target_sentences: List[str], tokenizer, max_length=32):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source_sentence = self.source_sentences[idx]
        target_sentence = self.target_sentences[idx]

        # Each encoding has shape (1, max_length); the extra batch dimension
        # is squeezed out in the training loop.
        tokenized_source = self.tokenizer(source_sentence, truncation=True, padding="max_length",
                                          max_length=self.max_length, return_tensors="pt")
        tokenized_target = self.tokenizer(target_sentence, truncation=True, padding="max_length",
                                          max_length=self.max_length, return_tensors="pt")

        return tokenized_source, tokenized_target

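# Read a file with one sentence per line, dropping empty strings (such as
# the one produced by a trailing newline).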
def load_sentences(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.read().split("\n")
    sentences = [sentence for sentence in sentences if sentence]
    return sentences

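# The tokenizer comes from the same checkpoint as the model (presumably a
# multilingual-to-English mBART variant), so a single tokenizer handles both
# the Luganda source and the English target.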
tokenizer = AutoTokenizer.from_pretrained("Sunbird/sunbird-mul-en-mbart-merged")

source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/train.lug")
target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/train.en")

dataset = TranslationDataset(source_sentences, target_sentences, tokenizer)

valid_source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/val.lug")
valid_target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/val.en")

valid_dataset = TranslationDataset(valid_source_sentences, valid_target_sentences, tokenizer)

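# Load the sequence-to-sequence weights from the same checkpoint that
# supplied the tokenizer.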
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("Sunbird/sunbird-mul-en-mbart-merged")

from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers.optimization import Adafactor, AdafactorSchedule
import torch.nn.functional as F

dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
# Validation order does not affect the averaged loss, so no shuffle is needed.
val_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = AdamW(model.parameters(), lr=1e-6)

model = model.to("cuda")

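# Standard fine-tuning loop: one full pass over the training data per epoch,
# then a full validation pass. The model is checkpointed whenever the average
# validation loss improves, and training stops after `early_stop` epochs
# without improvement, so num_epochs=5000 effectively means "until early
# stopping fires".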
def train_model(model, dataloader, val_dataloader, optimizer, num_epochs=5000,
                save_path="Total Combined Data V2 Aug 16 2023/Models/mul_en_base_v2.bin", early_stop=10):
    best_val_loss = float("inf")
    early_stop_counter = 0

    for epoch in range(num_epochs):
        model.train()
        for batch in dataloader:
            optimizer.zero_grad()
            # squeeze(1) drops the singleton dimension added by return_tensors="pt"
            # without also collapsing the batch dimension when a batch has size 1.
            input_ids = batch[0]["input_ids"].squeeze(1).to("cuda")
            attention_mask = batch[0]["attention_mask"].squeeze(1).to("cuda")
            labels = batch[1]["input_ids"].squeeze(1).to("cuda")
            # Mask padding positions so the cross-entropy loss ignores them.
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch[0]["input_ids"].squeeze(1).to("cuda")
                attention_mask = batch[0]["attention_mask"].squeeze(1).to("cuda")
                labels = batch[1]["input_ids"].squeeze(1).to("cuda")
                labels[labels == tokenizer.pad_token_id] = -100
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += outputs.loss.item()
        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Validation loss at epoch {epoch}: {avg_val_loss}")

        # Checkpoint on improvement; otherwise count toward early stopping.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        if early_stop_counter >= early_stop:
            print("Early stopping triggered")
            break


print("Training Begins Here!")
train_model(model, dataloader, val_dataloader, optimizer)

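# Evaluation: translate the test set with model.generate() using default
# decoding settings, then score against references with sacrebleu. The
# references are decoded from the tokenized labels, so they pass through the
# same 32-token truncation as the training data.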
print("Scoring Has Begun!")
from sacrebleu import corpus_bleu

test_source_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/test.lug")
test_target_sentences = load_sentences("Total Combined Data V2 Aug 16 2023/test.en")
test_dataset = TranslationDataset(test_source_sentences, test_target_sentences, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Reload the best checkpoint saved during training so scoring uses the weights
# with the lowest validation loss rather than those from the final epoch.
model.load_state_dict(torch.load("Total Combined Data V2 Aug 16 2023/Models/mul_en_base_v2.bin"))
model.eval()
model.to("cuda")

predictions = []
actuals = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0]["input_ids"].squeeze(1).to("cuda")
        attention_mask = batch[0]["attention_mask"].squeeze(1).to("cuda")
        labels = batch[1]["input_ids"].squeeze(1)
        # Note: mBART-style models often require forced_bos_token_id for the
        # target language; this assumes the checkpoint's generation config
        # already targets English.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

        # skip_special_tokens strips padding and sentinel tokens that would
        # otherwise distort the BLEU score.
        pred_sentences = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in outputs]
        actual_sentences = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in labels]
        predictions.extend(pred_sentences)
        actuals.extend(actual_sentences)

# sacrebleu takes a list of hypotheses and a list of reference streams.
bleu_score = corpus_bleu(predictions, [actuals]).score
print(f"BLEU score: {bleu_score}")