{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "b58b67f3", "metadata": {}, "outputs": [], "source": [ "# Import model and training classes from Hugging Face Transformers\n", "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments\n", "# Import dataset utilities from the datasets library\n", "from datasets import load_dataset, Dataset\n", "# Import pandas for CSV/DF handling\n", "import pandas as pd\n", "# Import torch for device checks and tensors\n", "import torch\n", "# Import evaluate to load evaluation metrics\n", "import evaluate\n", "# Import numpy for numeric array manipulation\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "id": "349f59d3", "metadata": {}, "outputs": [], "source": [ "# Path to the main corpus CSV file (expects two columns: source and target)\n", "data_path = \"./your/main/corpus.csv\" # Two columns: 'en' and 't'\n", "# Read the CSV into a pandas DataFrame\n", "df = pd.read_csv(data_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "f9232b43", "metadata": {}, "outputs": [], "source": [ "# Display the first 3 rows of the DataFrame to inspect loaded data\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "id": "88f11bdd", "metadata": {}, "outputs": [], "source": [ "# Ensure the dataframe has correct column names and no list-type values\n", "def ensure_text_columns(df):\n", " # If the DataFrame uses 'Src_lang'/'Tgt_lang', rename them to 'src'/'tgt'\n", " if 'Src_lang' in df.columns and 'Tgt_lang' in df.columns:\n", " df = df.rename(columns={\"Src_lang\": \"src\", \"Tgt_lang\": \"tgt\"})\n", " # blank line preserved for readability\n", " # Ensure all values are strings to avoid list/object types during tokenization\n", " df['src'] = df['src'].astype(str)\n", " df['tgt'] = df['tgt'].astype(str)\n", " # blank line preserved for readability\n", " return df # return the normalized DataFrame\n", "# 
Apply the helper to the loaded DataFrame\n", "df = ensure_text_columns(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "9c3fdfa7", "metadata": {}, "outputs": [], "source": [ "# Re-inspect the DataFrame after normalization\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "id": "3aaec3a8", "metadata": {}, "outputs": [], "source": [ "# Add language prefix tokens that will be prepended to source/target sentences\n", "prefix_src = \"src_lang_code\" # placeholder source language token\n", "prefix_tgt = \"tgt_lang_code\" # placeholder target language token" ] }, { "cell_type": "code", "execution_count": null, "id": "06bbfc98", "metadata": {}, "outputs": [], "source": [ "# Preprocessing function that adds language prefix tokens to each example\n", "def preprocess(example):\n", " # change the prefix_src and prefix_tgt to change the translation direction\n", " return {\n", " \"translation\": {\n", " \"src\": f\"{prefix_src} {example['src']}\", # prepend source prefix\n", " \"tgt\": f\"{prefix_tgt} {example['tgt']}\" # prepend target prefix\n", " }\n", " }" ] }, { "cell_type": "code", "execution_count": null, "id": "ad52beb7", "metadata": {}, "outputs": [], "source": [ "# Rename columns (no-op here but kept for clarity) and apply preprocessing to create a Dataset\n", "df = df.rename(columns={\"src\": \"src\", \"tgt\": \"tgt\"}) # explicit rename placeholder\n", "# Convert pandas DataFrame to a Hugging Face Dataset\n", "dataset = Dataset.from_pandas(df)\n", "# Apply preprocessing function to each example in the dataset\n", "dataset = dataset.map(preprocess)" ] }, { "cell_type": "code", "execution_count": null, "id": "fd30423f", "metadata": {}, "outputs": [], "source": [ "# Split the Dataset into training and validation sets\n", "split_dataset = dataset.train_test_split(test_size=0.1, seed=42) # 10% for validation\n", "# Extract train and validation Dataset objects\n", "train_data = split_dataset[\"train\"]\n", "val_data = 
split_dataset[\"test\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "951b1f86", "metadata": {}, "outputs": [], "source": [ "# Save processed train/validation splits to CSV files for later use\n", "train_data.to_csv(\"train_set.csv\", index=False) # write training set\n", "val_data.to_csv(\"val_set.csv\", index=False) # write validation set\n", "print(\"Train and validation data saved successfully:\") # confirmation\n", "print(f\"Train size: {len(train_data)}\") # show train count\n", "print(f\"Validation size: {len(val_data)}\") # show validation count" ] }, { "cell_type": "code", "execution_count": null, "id": "3cbc12e7", "metadata": {}, "outputs": [], "source": [ "# Load the MBART-50 tokenizer (fast implementation)\n", "tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n", "# Add any custom special tokens required (e.g., our target language code)\n", "tokenizer.add_special_tokens({'additional_special_tokens': [\"tgt_lang_code\"]})\n", "# Register the new lang token in the tokenizer's lang_code mapping\n", "tokenizer.lang_code_to_id[\"tgt_lang_code\"] = len(tokenizer.lang_code_to_id)\n", "# Rebuild reverse mapping from id to lang code (useful later)\n", "tokenizer.id_to_lang_code = {v: k for k, v in tokenizer.lang_code_to_id.items()}" ] }, { "cell_type": "code", "execution_count": null, "id": "fc507095", "metadata": {}, "outputs": [], "source": [ "# Load the pretrained MBART model for conditional generation\n", "model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n", "# Resize model token embeddings to account for any newly added tokens in the tokenizer\n", "model.resize_token_embeddings(len(tokenizer))" ] }, { "cell_type": "code", "execution_count": null, "id": "1ebd78be", "metadata": {}, "outputs": [], "source": [ "# --- Step 4: Tokenize data --- # tokenization settings and helper\n", "# Maximum tokenization length for inputs/targets\n", "max_length = 
128\n", "# blank line for readability\n", "# Tokenization function applied to dataset examples\n", "def tokenize_function(examples):\n", " # Tokenize source with padding/truncation to max_length\n", " inputs = tokenizer(examples[\"translation\"][\"src\"], padding=\"max_length\", truncation=True, max_length=max_length)\n", " # Tokenize target similarly\n", " targets = tokenizer(examples[\"translation\"][\"tgt\"], padding=\"max_length\", truncation=True, max_length=max_length)\n", " # Use tokenized target input_ids as labels for seq2seq training\n", " inputs[\"labels\"] = targets[\"input_ids\"]\n", " return inputs" ] }, { "cell_type": "code", "execution_count": null, "id": "bb922c07", "metadata": {}, "outputs": [], "source": [ "# Tokenize the dataset using the helper function defined above\n", "train_dataset = train_data.map(tokenize_function)\n", "val_dataset = val_data.map(tokenize_function)" ] }, { "cell_type": "code", "execution_count": null, "id": "213966dc", "metadata": {}, "outputs": [], "source": [ "# Import evaluation utilities (repeated import is safe inside notebook but already imported above)\n", "import evaluate\n", "# numpy imported earlier; this duplicate import is harmless\n", "import numpy as np\n", "# blank line for readability\n", "# Load metric implementations once to reuse inside compute_metrics\n", "bleu_metric = evaluate.load(\"bleu\")\n", "meteor_metric = evaluate.load(\"meteor\")\n", "ter_metric = evaluate.load(\"ter\")\n", "chrf_metric = evaluate.load(\"chrf\")\n", "# blank line for readability\n", "# Function used by Trainer to compute evaluation metrics from model outputs\n", "def compute_metrics(eval_preds):\n", " # eval_preds is a tuple (predictions, labels)\n", " preds, labels = eval_preds\n", " # blank line for readability\n", " # Decode predictions from token ids to strings\n", " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n", " # blank line for readability\n", " # Replace masked label tokens (-100) with 
pad token id so they decode properly\n", "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", "\n", "    # Decode label ids to strings\n", "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", "\n", "    # Clean whitespace from decoded strings\n", "    decoded_preds = [p.strip() for p in decoded_preds]\n", "    decoded_labels = [[l.strip()] for l in decoded_labels]  # list-of-lists: one reference per prediction\n", "\n", "    # Compute each metric using the decoded predictions and references\n", "    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)\n", "    meteor = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)\n", "    ter = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)\n", "    chrf = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)\n", "\n", "    # evaluate's 'bleu' metric reports key 'bleu' on a 0-1 scale, while\n", "    # sacrebleu-style implementations report 'score' on 0-100. Normalize to\n", "    # 0-100 so BLEU is on the same scale as ChrF and TER in the report.\n", "    bleu_score = bleu[\"score\"] if \"score\" in bleu else bleu[\"bleu\"] * 100\n", "\n", "    return {\n", "        \"ChrF\": chrf[\"score\"],  # MAIN METRIC (0-100)\n", "        \"BLEU\": bleu_score,     # normalized to 0-100\n", "        \"METEOR\": meteor[\"meteor\"],\n", "        \"TER\": ter[\"score\"]     # lower is better\n", "    }" ] }, { "cell_type": "code", "execution_count": null, "id": "824252e3", "metadata": {}, "outputs": [], "source": [ "# Configure Seq2Seq training arguments for the Hugging Face Trainer\n", "training_args = Seq2SeqTrainingArguments(\n", "    output_dir=\"./your/model/checkpoints\",  # directory for checkpoints and outputs\n", "    per_device_train_batch_size=8,  # batch size per device for training\n", "    per_device_eval_batch_size=8,  # batch size per device for evaluation\n", "    gradient_accumulation_steps=4,  # effective batch size = 8*4 = 32\n", "    learning_rate=3e-5,  # initial learning rate\n", "    weight_decay=0.01,  # weight decay for optimizer\n", "    
num_train_epochs=3,  # number of training epochs\n", "    warmup_steps=1000,  # number of warmup steps for scheduler\n", "    lr_scheduler_type=\"cosine\",  # learning rate scheduler type\n", "    fp16=torch.cuda.is_available(),  # enable fp16 if CUDA is available\n", "    eval_strategy=\"steps\",  # evaluate every N steps (renamed from evaluation_strategy in transformers v4.46+, which the processing_class API used below requires)\n", "    eval_steps=2000,  # evaluation interval in steps\n", "    save_strategy=\"steps\",  # save checkpoints every N steps\n", "    save_steps=2000,  # checkpoint saving interval\n", "    load_best_model_at_end=True,  # keep the best model according to metric\n", "    metric_for_best_model=\"ChrF\",  # metric used to select best model\n", "    greater_is_better=True,  # higher metric value is better\n", "    save_total_limit=5,  # limit number of saved checkpoints\n", "    predict_with_generate=True,  # use generate() for predictions during eval\n", "    generation_max_length=128,  # max length when generating predictions\n", "    generation_num_beams=4,  # number of beams for generation\n", "    logging_dir=\"./logs\",  # tensorboard/logging dir\n", "    logging_steps=200,  # logging interval\n", "    seed=42,  # random seed for reproducibility\n", "    report_to=\"none\",  # disable reporting to external services\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "dc6cb1bd", "metadata": {}, "outputs": [], "source": [ "# Import a data collator that pads to longest sequence in the batch for seq2seq models\n", "from transformers import DataCollatorForSeq2Seq\n", "\n", "# Create the data collator which will dynamically pad batch examples\n", "data_collator = DataCollatorForSeq2Seq(\n", "    tokenizer,\n", "    model=model,\n", "    padding=\"longest\",  # pad to the longest sequence in the batch\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "9508fc22", "metadata": {}, "outputs": [], "source": [ "# Create the Seq2SeqTrainer wrapper which handles training/evaluation loops\n", "trainer = 
Seq2SeqTrainer(\n", "    model=model,  # the model to train\n", "    args=training_args,  # training configuration\n", "    train_dataset=train_dataset,  # training data\n", "    eval_dataset=val_dataset,  # evaluation data\n", "    processing_class=tokenizer,  # tokenizer/processor used for the model (transformers v4.46+)\n", "    data_collator=data_collator,  # handles padding in batches\n", "    compute_metrics=compute_metrics,  # metrics callback for evaluation\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "d5aea8ad", "metadata": {}, "outputs": [], "source": [ "# Start training. This runs the main training loop according to training_args\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "id": "462c0da0", "metadata": {}, "outputs": [], "source": [ "# Evaluate the trained model on the validation set and print returned metrics\n", "eval_results = trainer.evaluate()\n", "print(eval_results)" ] }, { "cell_type": "code", "execution_count": null, "id": "59cfe87b", "metadata": {}, "outputs": [], "source": [ "# Show eval_results variable (already printed above) in a notebook cell to display its value\n", "eval_results" ] }, { "cell_type": "code", "execution_count": null, "id": "d629dc4b", "metadata": {}, "outputs": [], "source": [ "# Save the fine-tuned model weights and tokenizer to a directory\n", "model.save_pretrained(\"./your/model/name\")  # saves model config and weights\n", "tokenizer.save_pretrained(\"./your/model/name\")  # saves tokenizer files" ] }, { "cell_type": "code", "execution_count": null, "id": "0641551e", "metadata": {}, "outputs": [], "source": [ "# Load pipeline utilities for quick inference\n", "import torch\n", "from transformers import pipeline\n", "\n", "# Create a translation pipeline pointing at the saved model directory.\n", "# Bind the result to a NEW name: assigning it to `pipeline` would shadow the\n", "# imported factory and break any later pipeline(...) call in this session.\n", "translator = pipeline(\n", "    task=\"translation\",  # pipeline task\n", "    model=\"./your/model/name\",  # path to saved model\n", "    device=0 if torch.cuda.is_available() else -1,  # first GPU when available, else CPU\n", "    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # fp16 is only safe on GPU\n", "    src_lang=\"src_lang_code\",  # source language code token\n", "    tgt_lang=\"tgt_lang_code\",  # target language code token\n", ")\n", "# Run the pipeline on a sample sentence and print the translation\n", "print(translator(\"I like singing\"))" ] } ], "metadata": { "kernelspec": { "display_name": "ptorch", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }