In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast # MBART model and tokenizer classes
from tqdm import tqdm # progress bar for loops
import torch # PyTorch for tensors and device handling
import csv # CSV writer for output

In [None]:
# Fine-tuned MBART checkpoint on local disk (adjust the directory if yours differs)
model_path = "./combined_training/en_tgj_combined_model"

# Choose the compute device up front: CUDA when torch reports it available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and model weights both come from the same checkpoint directory
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

model.eval()  # inference only: switch the model to evaluation mode
model.to(device)  # place the model on the chosen device

In [None]:
# Tunable settings shared by the tokenization and generation cells

# Batching / length limits
batch_size = 16  # sentences translated per model call
max_length = 128  # token cap applied to both tokenized input and generated output

# Language codes
src_lang_token = "en_XX"  # source-language code prefixed to every input sentence
# NOTE(review): the target token is empty here, yet it is later looked up as the
# forced first generated token. Presumably this should be the target-language code
# used during fine-tuning -- TODO confirm and fill in.
tgt_lang_token = ""

In [None]:
# Load the English input: one sentence per line, surrounding whitespace removed,
# lines that are blank after stripping are dropped
with open("./sentences01.txt", "r", encoding="utf-8") as f:
    english_sentences = [text for text in map(str.strip, f) if text]

In [None]:
# Build the model inputs: "<source language code> <sentence>" for every sentence.
# NOTE(review): MBart50 tokenizers normally insert the source-language code
# themselves (via `src_lang`), so this textual prefix may end up duplicated --
# confirm it matches the preprocessing used when the model was fine-tuned.
prefixed_sentences = [src_lang_token + " " + sentence for sentence in english_sentences]

# Accumulator for the generated translations, filled by the batching loop
translated_sentences = list()

In [None]:
# Translate the prefixed sentences batch by batch and collect the decoded text.

# Start from an empty list so that re-running this cell does not append a second
# copy of every translation to results left over from a previous run.
translated_sentences = []

# Generation settings are identical for every batch, so build them once.
generation_kwargs = {
    "max_length": max_length,  # cap the generated length
    "num_beams": 5,  # beam search decoding
    "early_stopping": True,  # stop once beams finish
}

# Resolve the target-language token to an id once, outside the loop.
# An empty or unknown token cannot be resolved to a real vocabulary entry (the
# lookup yields the unknown-token id), and forcing that as the first generated
# token would make every translation start with <unk>. In that case the argument
# is left out so generation uses the model's own saved settings.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang_token) if tgt_lang_token else None
if forced_bos_token_id is None or forced_bos_token_id == tokenizer.unk_token_id:
    print(
        f"tgt_lang_token {tgt_lang_token!r} is not a token known to the tokenizer; "
        "generating without forced_bos_token_id."
    )
else:
    generation_kwargs["forced_bos_token_id"] = forced_bos_token_id

for i in tqdm(range(0, len(prefixed_sentences), batch_size), desc="Batch Translating"):
    batch = prefixed_sentences[i:i + batch_size]  # slice of at most batch_size sentences

    # Tokenize the batch (padded, truncated to max_length) and move tensors to the model device
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)

    with torch.no_grad():  # no gradients needed for inference
        generated_tokens = model.generate(**inputs, **generation_kwargs)

    # Decode token ids to text (special tokens removed) and collect results
    outputs = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    translated_sentences.extend(outputs)

In [None]:
# Save each source sentence next to its translation as a two-column CSV
with open("./output_entgj_combined01.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["original", "translated"])  # header row
    # zip pairs entries by position and stops at the shorter of the two lists
    writer.writerows(zip(english_sentences, translated_sentences))