File size: 16,560 Bytes

ad0be11

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b58b67f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import model and training classes from Hugging Face Transformers\n",
    "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
    "# Import dataset utilities from the datasets library\n",
    "from datasets import load_dataset, Dataset\n",
    "# Import pandas for CSV/DF handling\n",
    "import pandas as pd\n",
    "# Import torch for device checks and tensors\n",
    "import torch\n",
    "# Import evaluate to load evaluation metrics\n",
    "import evaluate\n",
    "# Import numpy for numeric array manipulation\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "349f59d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to the main corpus CSV file (expects two columns: source and target)\n",
    "data_path = \"./your/main/corpus.csv\"  # Two columns: 'en' and 't'\n",
    "# Read the CSV into a pandas DataFrame\n",
    "df = pd.read_csv(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9232b43",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the first 3 rows of the DataFrame to inspect loaded data\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88f11bdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure the dataframe has correct column names and no list-type values\n",
    "def ensure_text_columns(df):\n",
    "    # If the DataFrame uses 'Src_lang'/'Tgt_lang', rename them to 'src'/'tgt'\n",
    "    if 'Src_lang' in df.columns and 'Tgt_lang' in df.columns:\n",
    "        df = df.rename(columns={\"Src_lang\": \"src\", \"Tgt_lang\": \"tgt\"})\n",
    "    # blank line preserved for readability\n",
    "    # Ensure all values are strings to avoid list/object types during tokenization\n",
    "    df['src'] = df['src'].astype(str)\n",
    "    df['tgt'] = df['tgt'].astype(str)\n",
    "    # blank line preserved for readability\n",
    "    return df  # return the normalized DataFrame\n",
    "# Apply the helper to the loaded DataFrame\n",
    "df = ensure_text_columns(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c3fdfa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-inspect the DataFrame after normalization\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3aaec3a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add language prefix tokens that will be prepended to source/target sentences\n",
    "prefix_src = \"src_lang_code\"  # placeholder source language token\n",
    "prefix_tgt = \"tgt_lang_code\"  # placeholder target language token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06bbfc98",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing function that adds language prefix tokens to each example\n",
    "def preprocess(example):\n",
    "    # change the prefix_src and prefix_tgt to change the translation direction\n",
    "    return {\n",
    "        \"translation\": {\n",
    "            \"src\": f\"{prefix_src} {example['src']}\",  # prepend source prefix\n",
    "            \"tgt\": f\"{prefix_tgt} {example['tgt']}\"  # prepend target prefix\n",
    "        }\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad52beb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename columns (no-op here but kept for clarity) and apply preprocessing to create a Dataset\n",
    "df = df.rename(columns={\"src\": \"src\", \"tgt\": \"tgt\"})  # explicit rename placeholder\n",
    "# Convert pandas DataFrame to a Hugging Face Dataset\n",
    "dataset = Dataset.from_pandas(df)\n",
    "# Apply preprocessing function to each example in the dataset\n",
    "dataset = dataset.map(preprocess)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd30423f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the Dataset into training and validation sets\n",
    "split_dataset = dataset.train_test_split(test_size=0.1, seed=42)  # 10% for validation\n",
    "# Extract train and validation Dataset objects\n",
    "train_data = split_dataset[\"train\"]\n",
    "val_data = split_dataset[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "951b1f86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save processed train/validation splits to CSV files for later use\n",
    "train_data.to_csv(\"train_set.csv\", index=False)  # write training set\n",
    "val_data.to_csv(\"val_set.csv\", index=False)  # write validation set\n",
    "print(\"Train and validation data saved successfully:\")  # confirmation\n",
    "print(f\"Train size: {len(train_data)}\")  # show train count\n",
    "print(f\"Validation size: {len(val_data)}\")  # show validation count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cbc12e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the MBART-50 tokenizer (fast implementation)\n",
    "tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
    "# Add any custom special tokens required (e.g., our target language code)\n",
    "tokenizer.add_special_tokens({'additional_special_tokens': [\"tgt_lang_code\"]})\n",
    "# Register the new lang token in the tokenizer's lang_code mapping\n",
    "tokenizer.lang_code_to_id[\"tgt_lang_code\"] = len(tokenizer.lang_code_to_id)\n",
    "# Rebuild reverse mapping from id to lang code (useful later)\n",
    "tokenizer.id_to_lang_code = {v: k for k, v in tokenizer.lang_code_to_id.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc507095",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pretrained MBART model for conditional generation\n",
    "model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
    "# Resize model token embeddings to account for any newly added tokens in the tokenizer\n",
    "model.resize_token_embeddings(len(tokenizer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ebd78be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Step 4: Tokenize data ---  # tokenization settings and helper\n",
    "# Maximum tokenization length for inputs/targets\n",
    "max_length = 128\n",
    "# blank line for readability\n",
    "# Tokenization function applied to dataset examples\n",
    "def tokenize_function(examples):\n",
    "    # Tokenize source with padding/truncation to max_length\n",
    "    inputs = tokenizer(examples[\"translation\"][\"src\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
    "    # Tokenize target similarly\n",
    "    targets = tokenizer(examples[\"translation\"][\"tgt\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
    "    # Use tokenized target input_ids as labels for seq2seq training\n",
    "    inputs[\"labels\"] = targets[\"input_ids\"]\n",
    "    return inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb922c07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize the dataset using the helper function defined above\n",
    "train_dataset = train_data.map(tokenize_function)\n",
    "val_dataset = val_data.map(tokenize_function)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "213966dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import evaluation utilities (repeated import is safe inside notebook but already imported above)\n",
    "import evaluate\n",
    "# numpy imported earlier; this duplicate import is harmless\n",
    "import numpy as np\n",
    "# blank line for readability\n",
    "# Load metric implementations once to reuse inside compute_metrics\n",
    "bleu_metric = evaluate.load(\"bleu\")\n",
    "meteor_metric = evaluate.load(\"meteor\")\n",
    "ter_metric = evaluate.load(\"ter\")\n",
    "chrf_metric = evaluate.load(\"chrf\")\n",
    "# blank line for readability\n",
    "# Function used by Trainer to compute evaluation metrics from model outputs\n",
    "def compute_metrics(eval_preds):\n",
    "    # eval_preds is a tuple (predictions, labels)\n",
    "    preds, labels = eval_preds\n",
    "    # blank line for readability\n",
    "    # Decode predictions from token ids to strings\n",
    "    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
    "    # blank line for readability\n",
    "    # Replace masked label tokens (-100) with pad token id so they decode properly\n",
    "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
    "    # blank line for readability\n",
    "    # Decode label ids to strings\n",
    "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "    # blank line for readability\n",
    "    # Clean whitespace from decoded strings\n",
    "    decoded_preds = [p.strip() for p in decoded_preds]\n",
    "    decoded_labels = [[l.strip()] for l in decoded_labels]  # convert to list-of-lists for metrics\n",
    "    # blank line for readability\n",
    "    # Compute each metric using the decoded predictions and references\n",
    "    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    meteor = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    ter = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    chrf = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    # blank line for readability\n",
    "    # BLEU implementations may return different keys; try common ones\n",
    "    bleu_score = bleu.get(\"score\", bleu.get(\"bleu\"))\n",
    "    # blank line for readability\n",
    "    return {\n",
    "        \"ChrF\": chrf[\"score\"],      # MAIN METRIC\n",
    "        \"BLEU\": bleu_score,\n",
    "        \"METEOR\": meteor[\"meteor\"],\n",
    "        \"TER\": ter[\"score\"]\n",
    "    }\n",
    "# end of cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "824252e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure Seq2Seq training arguments for the Hugging Face Trainer\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=\"./your/model/checkpoints\",  # directory for checkpoints and outputs\n",
    "    per_device_train_batch_size=8,  # batch size per device for training\n",
    "    per_device_eval_batch_size=8,  # batch size per device for evaluation\n",
    "    gradient_accumulation_steps=4,      # effective batch size = 8*4 = 32\n",
    "    learning_rate=3e-5,  # initial learning rate\n",
    "    weight_decay=0.01,  # weight decay for optimizer\n",
    "    num_train_epochs=3,  # number of training epochs\n",
    "    warmup_steps=1000,  # number of warmup steps for scheduler\n",
    "    lr_scheduler_type=\"cosine\",  # learning rate scheduler type\n",
    "    fp16=torch.cuda.is_available(),  # enable fp16 if CUDA is available\n",
    "    evaluation_strategy=\"steps\",  # evaluate every X steps\n",
    "    eval_steps=2000,                    # evaluation interval in steps\n",
    "    save_strategy=\"steps\",  # save checkpoints every X steps\n",
    "    save_steps=2000,  # checkpoint saving interval\n",
    "    load_best_model_at_end=True,  # keep the best model according to metric\n",
    "    metric_for_best_model=\"ChrF\",  # metric used to select best model\n",
    "    greater_is_better=True,  # higher metric value is better\n",
    "    save_total_limit=5,  # limit number of saved checkpoints\n",
    "    predict_with_generate=True,  # use generate() for predictions during eval\n",
    "    generation_max_length=128,  # max length when generating predictions\n",
    "    generation_num_beams=4,  # number of beams for generation\n",
    "    logging_dir=\"./logs\",  # tensorboard/logging dir\n",
    "    logging_steps=200,  # logging interval\n",
    "    seed=42,  # random seed for reproducibility\n",
    "    report_to=\"none\",  # disable reporting to external services\n",
    ")\n",
    "# end of training_args cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc6cb1bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import a data collator that pads to longest sequence in the batch for seq2seq models\n",
    "from transformers import DataCollatorForSeq2Seq\n",
    "# blank line for readability\n",
    "# Create the data collator which will dynamically pad batch examples\n",
    "data_collator = DataCollatorForSeq2Seq(\n",
    "    tokenizer,\n",
    "    model=model,\n",
    "    padding=\"longest\",  # pad to the longest sequence in the batch\n",
    ")\n",
    "# end of data_collator cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9508fc22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the Seq2SeqTrainer wrapper which handles training/evaluation loops\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model=model,  # the model to train\n",
    "    args=training_args,  # training configuration\n",
    "    train_dataset=train_dataset,  # training data\n",
    "    eval_dataset=val_dataset,  # evaluation data\n",
    "    processing_class=tokenizer,  # tokenizer/processor used for the model\n",
    "    data_collator=data_collator,  # handles padding in batches\n",
    "    compute_metrics=compute_metrics,  # metrics callback for evaluation\n",
    ")\n",
    "# end of trainer creation cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5aea8ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start training. This runs the main training loop according to training_args\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "462c0da0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the trained model on the validation set and print returned metrics\n",
    "eval_results = trainer.evaluate()\n",
    "print(eval_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59cfe87b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show eval_results variable (already printed above) in a notebook cell to display its value\n",
    "eval_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d629dc4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fine-tuned model weights and tokenizer to a directory\n",
    "model.save_pretrained(\"./your/model/name\")  # saves model config and weights\n",
    "tokenizer.save_pretrained(\"./your/model/name\")  # saves tokenizer files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0641551e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load pipeline utilities for quick inference\n",
    "import torch\n",
    "from transformers import pipeline\n",
    "# blank line for readability\n",
    "# Create a translation pipeline pointing at the saved model directory\n",
    "pipeline = pipeline(\n",
    "    task=\"translation\",  # pipeline task\n",
    "    model=\"./your/model/name\",  # path to saved model\n",
    "    device=0,  # device id (0 for first GPU); set to -1 for CPU\n",
    "    torch_dtype=torch.float16,  # use float16 if model and device support it\n",
    "    src_lang=\"src_lang_code\",  # source language code token\n",
    "    tgt_lang=\"tgt_lang_code\",  # target language code token\n",
    ")\n",
    "# Run the pipeline on a sample sentence and print the translation\n",
    "print(pipeline(\"I like singing\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ptorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}