In [None]:
# Import model and training classes from Hugging Face Transformers
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
# Import dataset utilities from the datasets library
from datasets import load_dataset, Dataset
# Import pandas for CSV/DF handling
import pandas as pd
# Import torch for device checks and tensors
import torch
# Import evaluate to load evaluation metrics
import evaluate
# Import numpy for numeric array manipulation
import numpy as np

In [None]:
# Location of the parallel-corpus CSV. The normalization helper defined below
# works with either 'Src_lang'/'Tgt_lang' or 'src'/'tgt' column names.
data_path = "./your/main/corpus.csv"
# Load the whole corpus into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Peek at the first three rows of the freshly loaded corpus.
df.head(n=3)

In [None]:
def ensure_text_columns(df):
    """Return a normalized copy of ``df`` with string-typed ``src``/``tgt`` columns.

    Steps:
      1. If both 'Src_lang' and 'Tgt_lang' are present they are renamed to
         'src' and 'tgt'.
      2. Rows where either side of the pair is missing are dropped and the index
         is reset to a plain 0..n-1 range.
      3. Both columns are cast to ``str`` so tokenization never receives
         non-string objects.

    The caller's DataFrame is never modified; any other columns are kept as-is.

    Args:
        df: DataFrame holding the parallel corpus.

    Returns:
        A new DataFrame with 'src' and 'tgt' columns containing only strings.

    Raises:
        KeyError: if neither naming scheme provides both required columns.
    """
    if 'Src_lang' in df.columns and 'Tgt_lang' in df.columns:
        df = df.rename(columns={"Src_lang": "src", "Tgt_lang": "tgt"})

    # Fail early with an actionable message instead of a bare KeyError('src').
    missing = [col for col in ("src", "tgt") if col not in df.columns]
    if missing:
        raise KeyError(
            "Expected columns 'src' and 'tgt' (or 'Src_lang' and 'Tgt_lang'); "
            f"missing {missing}, found {list(df.columns)}"
        )

    # Drop incomplete pairs *before* casting: astype(str) would otherwise turn a
    # missing cell into the literal text "nan" and feed it to the model.
    df = df.dropna(subset=["src", "tgt"]).reset_index(drop=True)

    # assign() returns a new frame, so the input object is left untouched.
    return df.assign(src=df["src"].astype(str), tgt=df["tgt"].astype(str))
# Normalize the loaded corpus by piping it through the helper.
df = df.pipe(ensure_text_columns)

In [None]:
# Check the first three rows again now that the columns are normalized.
df.head(n=3)

In [None]:
# Placeholder language tokens (source first, target second) that get prepended
# to the source and target sentences during preprocessing.
prefix_src, prefix_tgt = "src_lang_code", "tgt_lang_code"

In [None]:
def preprocess(example, src_prefix=None, tgt_prefix=None):
    """Build the prefixed ``translation`` record for one example.

    Args:
        example: mapping with 'src' and 'tgt' text fields.
        src_prefix: token prepended to the source text. Defaults to the
            notebook-level ``prefix_src``.
        tgt_prefix: token prepended to the target text. Defaults to the
            notebook-level ``prefix_tgt``.

    Returns:
        ``{"translation": {"src": "<src_prefix> <src>", "tgt": "<tgt_prefix> <tgt>"}}``

    To change the translation direction either edit ``prefix_src`` /
    ``prefix_tgt`` or pass the prefixes explicitly, e.g.
    ``dataset.map(preprocess, fn_kwargs={"src_prefix": ..., "tgt_prefix": ...})``.
    """
    # Resolve the defaults at call time so later edits to the globals are honored.
    if src_prefix is None:
        src_prefix = prefix_src
    if tgt_prefix is None:
        tgt_prefix = prefix_tgt

    return {
        "translation": {
            "src": f"{src_prefix} {example['src']}",
            "tgt": f"{tgt_prefix} {example['tgt']}",
        }
    }

In [None]:
# Convert the normalized DataFrame (columns 'src' and 'tgt') into a Hugging Face
# Dataset. preserve_index=False keeps a non-default pandas index (for example one
# left behind by row filtering) from being stored as an extra
# '__index_level_0__' column.
dataset = Dataset.from_pandas(df, preserve_index=False)
# Add the prefixed 'translation' field to every example.
dataset = dataset.map(preprocess)

In [None]:
# Hold out 10% of the examples for validation; the fixed seed keeps the split stable.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
# The held-out part is stored under the "test" key and serves as the validation set here.
train_data, val_data = split_dataset["train"], split_dataset["test"]

In [None]:
# Write both processed splits to CSV so they can be reused later.
for _csv_path, _split in (("train_set.csv", train_data), ("val_set.csv", val_data)):
    _split.to_csv(_csv_path, index=False)

# Confirm the export and report how many examples each split holds.
print("Train and validation data saved successfully:")
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")

In [None]:
# Load the MBART-50 tokenizer (fast implementation).
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Custom language code to register next to the built-in mBART-50 codes.
new_lang_code = "tgt_lang_code"

# Extend the additional-special-token list rather than replacing it. By default
# add_special_tokens() swaps the whole list out, which would strip the "special"
# flag from the pretrained language codes kept in that list.
tokenizer.add_special_tokens(
    {"additional_special_tokens": [new_lang_code]},
    replace_additional_special_tokens=False,
)

# Map the code to the vocabulary id the tokenizer actually assigned to it.
# (len(lang_code_to_id) is just the number of language codes, i.e. the id of an
# unrelated token.)
tokenizer.lang_code_to_id[new_lang_code] = tokenizer.convert_tokens_to_ids(new_lang_code)
# Rebuild the reverse mapping from id to language code.
tokenizer.id_to_lang_code = {v: k for k, v in tokenizer.lang_code_to_id.items()}

In [None]:
# Start from the pretrained mBART-50 many-to-many checkpoint.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# Size the embedding matrix to the tokenizer's current vocabulary, which
# includes any tokens added after loading.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

In [None]:
# --- Step 4: Tokenize data ---
# Upper bound (in tokens) applied to both source and target sequences.
max_length = 128


def tokenize_function(examples):
    """Tokenize one example's source/target pair for seq2seq training.

    Args:
        examples: a single dataset record whose ``"translation"`` field holds
            the prefixed ``"src"`` and ``"tgt"`` strings.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        entry containing the target token ids.

    Sequences are truncated to ``max_length`` but deliberately NOT padded here.
    Padding is left to the ``DataCollatorForSeq2Seq`` used at training time,
    which pads inputs per batch and pads labels with -100 so padding positions
    are ignored by the loss. Padding labels to ``max_length`` with the pad
    token id at this stage would make the loss count every padding position.
    """
    pair = examples["translation"]
    inputs = tokenizer(pair["src"], truncation=True, max_length=max_length)
    targets = tokenizer(pair["tgt"], truncation=True, max_length=max_length)
    # Target token ids serve as the decoder labels.
    inputs["labels"] = targets["input_ids"]
    return inputs

In [None]:
# Run the tokenization helper over both splits (train first, then validation).
train_dataset, val_dataset = (
    split.map(tokenize_function) for split in (train_data, val_data)
)

In [None]:
# Import evaluation utilities (repeated import is safe inside notebook but already imported above)
import evaluate
# numpy imported earlier; this duplicate import is harmless
import numpy as np
# blank line for readability
# Load every metric once up front so compute_metrics can reuse the same objects.
bleu_metric, meteor_metric, ter_metric, chrf_metric = map(
    evaluate.load, ("bleu", "meteor", "ter", "chrf")
)
def compute_metrics(eval_preds):
    """Compute ChrF, BLEU, METEOR and TER for the Trainer's evaluation loop.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays as passed
            by the Trainer. ``predictions`` may itself be a tuple, in which case
            its first element is used.

    Returns:
        dict with keys "ChrF" (main metric), "BLEU", "METEOR" and "TER".
    """
    preds, labels = eval_preds
    # Some models hand back a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id. Replace
    # it in BOTH arrays, since the tokenizer cannot decode a negative id.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode ids back to text and trim surrounding whitespace.
    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # The metrics take a list of references per prediction, hence the inner list.
    decoded_labels = [
        [text.strip()] for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    meteor = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)
    ter = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)
    chrf = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # BLEU implementations differ in their result key; accept either one.
    bleu_score = bleu.get("score", bleu.get("bleu"))

    return {
        "ChrF": chrf["score"],  # MAIN METRIC
        "BLEU": bleu_score,
        "METEOR": meteor["meteor"],
        "TER": ter["score"],
    }

In [None]:
# Configure Seq2Seq training arguments for the Hugging Face Trainer.
# `eval_strategy` is the current name of the former `evaluation_strategy`
# argument, which is deprecated; the Trainer cell below already relies on a
# Transformers release that provides it (it uses `processing_class`).
training_args = Seq2SeqTrainingArguments(
    output_dir="./your/model/checkpoints",  # directory for checkpoints and outputs
    per_device_train_batch_size=8,  # batch size per device for training
    per_device_eval_batch_size=8,  # batch size per device for evaluation
    gradient_accumulation_steps=4,  # effective per-device batch size = 8*4 = 32
    learning_rate=3e-5,  # initial learning rate
    weight_decay=0.01,  # weight decay for optimizer
    num_train_epochs=3,  # number of training epochs
    warmup_steps=1000,  # number of warmup steps for scheduler
    lr_scheduler_type="cosine",  # learning rate scheduler type
    fp16=torch.cuda.is_available(),  # enable fp16 only when CUDA is available
    eval_strategy="steps",  # evaluate every `eval_steps` steps
    eval_steps=2000,  # evaluation interval in steps
    save_strategy="steps",  # save checkpoints every `save_steps` steps
    save_steps=2000,  # kept equal to eval_steps so each evaluation has a checkpoint
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="ChrF",  # key returned by compute_metrics
    greater_is_better=True,  # higher ChrF is better
    save_total_limit=5,  # limit number of saved checkpoints
    predict_with_generate=True,  # use generate() for predictions during eval
    generation_max_length=128,  # max length when generating predictions
    generation_num_beams=4,  # number of beams for generation
    logging_dir="./logs",  # tensorboard/logging dir
    logging_steps=200,  # logging interval
    seed=42,  # random seed for reproducibility
    report_to="none",  # disable reporting to external services
)

In [None]:
# Seq2seq-aware collator from Transformers.
from transformers import DataCollatorForSeq2Seq

# Batches are padded on the fly to the longest sequence they contain.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

In [None]:
# Gather everything the trainer needs, then build it in one call.
trainer_kwargs = dict(
    model=model,                      # network being fine-tuned
    args=training_args,               # hyperparameters and schedule
    train_dataset=train_dataset,      # tokenized training split
    eval_dataset=val_dataset,         # tokenized validation split
    processing_class=tokenizer,       # tokenizer associated with the model
    data_collator=data_collator,      # per-batch padding
    compute_metrics=compute_metrics,  # evaluation metrics callback
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Launch fine-tuning with the trainer built in the previous cell, using the
# settings in `training_args`. The call is the cell's last expression, so its
# return value is rendered as the cell output.
trainer.train()

In [None]:
# Run evaluation through the trainer. No dataset is passed, so the trainer uses
# the `eval_dataset` it was constructed with (`val_dataset`). The returned
# metrics are kept in `eval_results` and printed.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Bare expression: lets the notebook render `eval_results` as the cell output
# (the same value was already printed in the previous cell).
eval_results

In [None]:
# Export the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "./your/model/name"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Quick inference check with the saved model.
import torch
# Import the factory under an alias: binding the result to the name `pipeline`
# would otherwise overwrite the factory itself and make this cell fail when
# it is run a second time.
from transformers import pipeline as build_pipeline

# Mirror the training setup: use the GPU and half precision only when CUDA exists.
use_gpu = torch.cuda.is_available()

# Translation pipeline backed by the saved model directory.
# NOTE(review): "src_lang_code" / "tgt_lang_code" are placeholders; replace them
# with language codes the saved tokenizer actually knows.
pipeline = build_pipeline(
    task="translation",  # pipeline task
    model="./your/model/name",  # path to saved model
    device=0 if use_gpu else -1,  # first GPU when available, otherwise CPU
    torch_dtype=torch.float16 if use_gpu else torch.float32,  # fp16 only on GPU
    src_lang="src_lang_code",  # source language code token
    tgt_lang="tgt_lang_code",  # target language code token
)
# Run the pipeline on a sample sentence and print the translation.
print(pipeline("I like singing"))