repleeka committed on
Commit
ad0be11
·
verified ·
1 Parent(s): 47886c3

Upload 5 files

Browse files
scripts/KIFS_filtering_script.ipynb ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "311e31e2",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# Import pandas for DataFrame manipulation\n",
11
+ "import pandas as pd\n",
12
+ "# Import numpy for numerical operations\n",
13
+ "import numpy as np\n",
14
+ "# Import torch for tensor operations and device handling\n",
15
+ "import torch\n",
16
+ "# Import MBART model and tokenizer from Hugging Face Transformers\n",
17
+ "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n",
18
+ "# Import cosine similarity for comparing embeddings\n",
19
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
20
+ "# Import tqdm to show progress bars for loops\n",
21
+ "from tqdm import tqdm\n",
22
+ "# Import regex utilities for tokenization and cleaning\n",
23
+ "import re"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "id": "3363ac62",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# --- Configuration ---\n",
34
+ "MODEL_NAME = \"your/model/name\"\n",
35
+ "SRC_LANG_CODE = \"src_lang_code\"\n",
36
+ "TGT_LANG_CODE = \"tgt_lang_code\"\n",
37
+ "CORPUS_FILE = \"your/corpus/here.csv\"\n",
38
+ "DICT_FILE = \"your/bilingual/dictionary/here.csv\""
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "d5d67ae6",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# Hyperparameters for the Knowledge Score (KS_i)\n",
49
+ "# You would tune these based on empirical performance\n",
50
+ "ALPHA = 0.1\n",
51
+ "BETA = 0.3\n",
52
+ "GAMMA = 0.6\n",
53
+ "PERCENTILE_THRESHOLD = 70 # Filter threshold: keep pairs above this percentile"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "id": "f5fb7924",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "def preprocess_text(text):\n",
64
+ " \"\"\"\n",
65
+ " Safely preprocesses text by handling NaN, non-string values,\n",
66
+ " and performing normalization steps.\n",
67
+ " \"\"\"\n",
68
+ " if not isinstance(text, str):\n",
69
+ " return \"\"\n",
70
+ " text = text.strip().lower()\n",
71
+ " text = re.sub(r\"\\s+\", \" \", text) # Collapse multiple spaces\n",
72
+ " text = re.sub(r\"[^a-zA-Z0-9\\s']\", \"\", text) # Remove unwanted symbols (keep alphanumerics and apostrophes)\n",
73
+ " return text\n",
74
+ "\n",
75
+ "\n",
76
+ "def load_data(corpus_file, dict_file):\n",
77
+ " \"\"\"Loads, cleans, and prepares the parallel corpus and bilingual dictionary.\"\"\"\n",
78
+ "\n",
79
+ " # --- Load the CSVs safely ---\n",
80
+ " try:\n",
81
+ " raw_corpus = pd.read_csv(corpus_file)\n",
82
+ " word_dictionary = pd.read_csv(dict_file)\n",
83
+ " except Exception as e:\n",
84
+ " raise ValueError(f\"Error loading files: {e}\")\n",
85
+ "\n",
86
+ " # --- Ensure expected columns exist ---\n",
87
+ " required_corpus_cols = {'English', 'Tagin'}\n",
88
+ " required_dict_cols = {'English', 'Tagin'}\n",
89
+ "\n",
90
+ " if not required_corpus_cols.issubset(raw_corpus.columns):\n",
91
+ " raise ValueError(f\"Corpus file must contain columns: {required_corpus_cols}\")\n",
92
+ " if not required_dict_cols.issubset(word_dictionary.columns):\n",
93
+ " raise ValueError(f\"Dictionary file must contain columns: {required_dict_cols}\")\n",
94
+ "\n",
95
+ " # --- Drop rows with all NaN values ---\n",
96
+ " raw_corpus = raw_corpus.dropna(how='all')\n",
97
+ "\n",
98
+ " # --- Fill NaN cells with empty strings ---\n",
99
+ " raw_corpus = raw_corpus.fillna(\"\")\n",
100
+ "\n",
101
+ " # --- Apply text preprocessing ---\n",
102
+ " raw_corpus[\"English\"] = raw_corpus[\"English\"].apply(preprocess_text)\n",
103
+ " raw_corpus[\"Tagin\"] = raw_corpus[\"Tagin\"].apply(preprocess_text)\n",
104
+ "\n",
105
+ " # --- Clean dictionary entries ---\n",
106
+ " word_dictionary[\"English\"] = word_dictionary[\"English\"].apply(preprocess_text)\n",
107
+ " word_dictionary[\"Tagin\"] = word_dictionary[\"Tagin\"].apply(preprocess_text)\n",
108
+ "\n",
109
+ " # --- Convert dictionary to mapping ---\n",
110
+ " word_dictionary = word_dictionary.set_index('English')['Tagin'].to_dict()\n",
111
+ "\n",
112
+ " # --- Remove empty rows after cleaning ---\n",
113
+ " raw_corpus = raw_corpus[\n",
114
+ " (raw_corpus[\"English\"].str.strip() != \"\") &\n",
115
+ " (raw_corpus[\"Tagin\"].str.strip() != \"\")\n",
116
+ " ].reset_index(drop=True)\n",
117
+ "\n",
118
+ " print(f\"Loaded {len(raw_corpus)} sentence pairs and {len(word_dictionary)} dictionary entries.\")\n",
119
+ " return raw_corpus, word_dictionary"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "772824d1",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "load_data(CORPUS_FILE,DICT_FILE)"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "7322656f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# Function for Step 2: Perplexity (PPL)\n",
140
+ "@torch.no_grad()\n",
141
+ "def calculate_perplexity(sentence, model, tokenizer, device):\n",
142
+ " \"\"\"Computes perplexity of a sentence using the given LM.\"\"\"\n",
143
+ " try:\n",
144
+ " # Tokenize and format for mBART-50 (e.g., [lang_code] X [eos])\n",
145
+ " # We'll treat this as a generation task from the source language to itself\n",
146
+ " # to get log probabilities for the language modeling loss.\n",
147
+ " input_ids = tokenizer(\n",
148
+ " sentence,\n",
149
+ " return_tensors=\"pt\",\n",
150
+ " max_length=512,\n",
151
+ " truncation=True\n",
152
+ " ).input_ids.to(device)\n",
153
+ " \n",
154
+ " # Set the source language\n",
155
+ " tokenizer.src_lang = SRC_LANG_CODE\n",
156
+ " \n",
157
+ " # The labels for perplexity are the input tokens themselves, shifted.\n",
158
+ " # This is essentially a language modeling task.\n",
159
+ " labels = input_ids.clone()\n",
160
+ " \n",
161
+ " # Use -100 to ignore the loss for special tokens (like the language code token)\n",
162
+ " labels[:, 0] = -100\n",
163
+ "\n",
164
+ " outputs = model(input_ids=input_ids, labels=labels)\n",
165
+ " neg_log_likelihood = outputs.loss\n",
166
+ " \n",
167
+ " # Perplexity is exp(average negative log-likelihood)\n",
168
+ " # The 'outputs.loss' from the Transformers library is already the average NLL per token.\n",
169
+ " ppl = torch.exp(neg_log_likelihood).item()\n",
170
+ " return ppl\n",
171
+ " except Exception as e:\n",
172
+ " print(f\"Error calculating PPL for: '{sentence}'. Error: {e}\")\n",
173
+ " return float('inf') # Return a very high PPL for errors/bad sentences\n",
174
+ "\n"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": null,
180
+ "id": "231f19a8",
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "def normalize_inverse_ppl(ppl_scores, epsilon=1e-6):\n",
185
+ " \"\"\"\n",
186
+ " Safely normalizes inverse perplexity (1/PPL_i) to [0, 1].\n",
187
+ " \n",
188
+ " Handles edge cases where PPL scores are constant, contain inf/nan, or are invalid.\n",
189
+ " \"\"\"\n",
190
+ " ppl_scores = np.array(ppl_scores, dtype=np.float64)\n",
191
+ "\n",
192
+ " # Replace infinities or NaNs with large finite numbers for stability\n",
193
+ " ppl_scores = np.nan_to_num(ppl_scores, nan=np.inf, posinf=np.inf, neginf=np.inf)\n",
194
+ "\n",
195
+ " # Compute inverse PPL (fluency measure)\n",
196
+ " inv_ppl = 1.0 / (ppl_scores + epsilon)\n",
197
+ "\n",
198
+ " # Remove any remaining NaNs/Infs from inverse scores\n",
199
+ " inv_ppl = np.nan_to_num(inv_ppl, nan=0.0, posinf=0.0, neginf=0.0)\n",
200
+ "\n",
201
+ " inv_min = np.min(inv_ppl)\n",
202
+ " inv_max = np.max(inv_ppl)\n",
203
+ "\n",
204
+ " # Handle zero-range case: all scores are the same\n",
205
+ " if np.isclose(inv_max, inv_min) or np.isnan(inv_max - inv_min):\n",
206
+ " return np.zeros_like(inv_ppl)\n",
207
+ "\n",
208
+ " # Normal min–max scaling\n",
209
+ " inv_ppl_norm = (inv_ppl - inv_min) / (inv_max - inv_min)\n",
210
+ " inv_ppl_norm = np.clip(inv_ppl_norm, 0.0, 1.0)\n",
211
+ "\n",
212
+ " return inv_ppl_norm\n",
213
+ "\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "ad791177",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "# Function for Step 3: Semantic Similarity (Sim)\n",
224
+ "\n",
225
+ "def calculate_semantic_similarity(s_i, t_i, model, tokenizer, device):\n",
226
+ " \"\"\"\n",
227
+ " Computes Cosine Similarity between source and target sentence embeddings \n",
228
+ " and normalizes the result to the range [0, 1].\n",
229
+ " \"\"\"\n",
230
+ " try:\n",
231
+ " def get_embedding(sentence, lang_code):\n",
232
+ " tokenizer.src_lang = lang_code\n",
233
+ " inputs = tokenizer(\n",
234
+ " sentence,\n",
235
+ " return_tensors=\"pt\",\n",
236
+ " max_length=512,\n",
237
+ " truncation=True,\n",
238
+ " padding=True\n",
239
+ " ).to(device)\n",
240
+ " \n",
241
+ " with torch.no_grad():\n",
242
+ " encoder_output = model.model.encoder(**inputs).last_hidden_state\n",
243
+ " \n",
244
+ " mean_embedding = encoder_output[:, 1:-1, :].mean(dim=1).squeeze() \n",
245
+ " \n",
246
+ " return mean_embedding.cpu().detach().numpy().reshape(1, -1)\n",
247
+ "\n",
248
+ " emb_s = get_embedding(s_i, SRC_LANG_CODE) \n",
249
+ " emb_t = get_embedding(t_i, TGT_LANG_CODE)\n",
250
+ "\n",
251
+ " sim_raw = cosine_similarity(emb_s, emb_t)[0][0]\n",
252
+ " \n",
253
+ " sim_normalized = (sim_raw + 1) / 2\n",
254
+ " sim_normalized = max(0.0, min(1.0, sim_normalized))\n",
255
+ " \n",
256
+ " return sim_normalized\n",
257
+ " \n",
258
+ " except Exception as e:\n",
259
+ " # print(f\"Error calculating Sim for: '{s_i}' and '{t_i}'. Error: {e}\")\n",
260
+ " return 0.0"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "87eebd8b",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "# Function for Step 4: Lexical Match (Lex) # header describing the block\n",
271
+ "# blank line preserved for readability\n",
272
+ "# Define a function that computes a lexical match score based on a bilingual dictionary\n",
273
+ "def calculate_lexical_match(s_i, t_i, word_dictionary):\n",
274
+ " # Docstring start: describe purpose and formula for lex score\n",
275
+ " \"\"\"\n",
276
+ " Computes a dictionary-based lexical match score prioritizing phrase matches.\n",
277
+ " Score = (Count of source words covered by successfully translated phrases) / (Total words in source sentence)\n",
278
+ " \"\"\" # docstring end\n",
279
+ " # Helper: normalize text to ease phrase matching (lowercase, token boundaries)\n",
280
+ " def normalize_text(text):\n",
281
+ " # Simple tokenization: lowercase, remove non-word characters, and join back for easy phrase matching\n",
282
+ " return \" \" + \" \".join(re.findall(r'\\b\\w+\\b', text.lower())) + \" \" # pad with spaces for boundary-safe matching\n",
283
+ " # Normalize the source sentence for phrase lookups\n",
284
+ " s_normalized = normalize_text(s_i)\n",
285
+ " # Token set of the target sentence for quick membership tests\n",
286
+ " t_tokens = set(re.findall(r'\\b\\w+\\b', t_i.lower()))\n",
287
+ " # blank line preserved for readability\n",
288
+ " # Sort dictionary keys by length (descending) to prioritize phrase matches over single words\n",
289
+ " tagin_phrases = sorted(word_dictionary.keys(), key=len, reverse=True)\n",
290
+ " # blank line preserved for readability\n",
291
+ " # Extract source word tokens and compute total count\n",
292
+ " source_words = re.findall(r'\\b\\w+\\b', s_i.lower())\n",
293
+ " total_source_words = len(source_words)\n",
294
+ " # Initialize covered word counter\n",
295
+ " covered_word_count = 0\n",
296
+ " # If the source sentence is empty, return 0.0 immediately\n",
297
+ " if total_source_words == 0:\n",
298
+ " return 0.0\n",
299
+ " # Track indices of covered words if needed (not used further but kept for clarity)\n",
300
+ " covered_indices = set()\n",
301
+ " # Iterate over dictionary phrases (longest-first) to find matches in the source\n",
302
+ " for phrase in tagin_phrases:\n",
303
+ " # Skip empty dictionary entries\n",
304
+ " if not phrase:\n",
305
+ " continue\n",
306
+ " # Normalize the phrase for safe matching\n",
307
+ " norm_phrase = normalize_text(phrase)\n",
308
+ " # If the normalized phrase exists in the normalized source text, proceed\n",
309
+ " if norm_phrase in s_normalized:\n",
310
+ " # Get expected translation from the dictionary (lowercased)\n",
311
+ " expected_translation = word_dictionary[phrase].lower()\n",
312
+ " # Tokenize the expected translation into words\n",
313
+ " translation_words = re.findall(r'\\b\\w+\\b', expected_translation)\n",
314
+ " # Check whether all translated words appear in the target sentence tokens\n",
315
+ " is_translation_present = all(word in t_tokens for word in translation_words)\n",
316
+ " # If the translation words are present in the target, count the phrase as covered\n",
317
+ " if is_translation_present:\n",
318
+ " # Search for possibly multiple occurrences of the phrase in the source\n",
319
+ " start = 0\n",
320
+ " while True:\n",
321
+ " # Find next occurrence starting from 'start' index\n",
322
+ " start_index = s_normalized.find(norm_phrase, start)\n",
323
+ " # If no more occurrences, break the loop\n",
324
+ " if start_index == -1:\n",
325
+ " break\n",
326
+ " # Count how many words are in the matched phrase\n",
327
+ " phrase_word_count = len(re.findall(r'\\b\\w+\\b', phrase))\n",
328
+ " # Add the phrase's word count to the covered total\n",
329
+ " covered_word_count += phrase_word_count\n",
330
+ " # Advance the search start position past the current match\n",
331
+ " start = start_index + len(norm_phrase)\n",
332
+ " # end while loop for occurrences\n",
333
+ " # After checking all phrases, compute lex score as covered words / total source words (capped at 1.0)\n",
334
+ " lex_score = min(1.0, covered_word_count / total_source_words)\n",
335
+ " # Return lexical match score between 0 and 1\n",
336
+ " return lex_score"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": null,
342
+ "id": "46f310e4",
343
+ "metadata": {},
344
+ "outputs": [],
345
+ "source": [
346
+ "# Main Algorithm Implementation\n",
347
+ "def knowledge_based_filtering(raw_corpus, word_dictionary, alpha, beta, gamma, percentile_threshold):\n",
348
+ " # 1. Load Model and Tokenizer (Step 1 of the algorithm's loop)\n",
349
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
350
+ "    print(f\"Loading model '{MODEL_NAME}' to {device}...\")\n",
351
+ " model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)\n",
352
+ " tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)\n",
353
+ " \n",
354
+ " # Ensure source/target language codes are in the tokenizer vocabulary\n",
355
+ " if SRC_LANG_CODE not in tokenizer.vocab or TGT_LANG_CODE not in tokenizer.vocab:\n",
356
+ " print(f\"Warning: Language codes {SRC_LANG_CODE} or {TGT_LANG_CODE} not found in base mBART-tgj-base vocab.\")\n",
357
+ " print(\"Using placeholder language codes. Results may not be accurate.\")\n",
358
+ "\n",
359
+ " results = []\n",
360
+ "\n",
361
+ " # 2. Iterate through the corpus (Lines 1-6)\n",
362
+ " print(\"Processing corpus to calculate scores...\")\n",
363
+ " for index, row in tqdm(raw_corpus.iterrows(), total=len(raw_corpus), desc=\"Calculating KS\"):\n",
364
+ " s_i = row['English']\n",
365
+ " t_i = row['Tagin']\n",
366
+ "\n",
367
+ " # Line 2: Compute PPL_i (Lower is better)\n",
368
+ " pp= calculate_perplexity(s_i, model, tokenizer, device)\n",
369
+ " PPL_i= normalize_inverse_ppl(pp, epsilon=1e-6)\n",
370
+ " # PPL_i = normalize_inverse_ppl(row[\"Perplexity\"])\n",
371
+ " \n",
372
+ " # Line 3: Compute Sim_i (Higher is better)\n",
373
+ " Sim_i = calculate_semantic_similarity(s_i, t_i, model, tokenizer, device)\n",
374
+ " \n",
375
+ " # Line 4: Check Lex_i (Higher is better)\n",
376
+ " Lex_i = calculate_lexical_match(s_i, t_i, word_dictionary)\n",
377
+ " \n",
378
+ " # Line 5: Derive Knowledge Score (KS_i)\n",
379
+ " # Note: We use 1/PPL_i because PPL_i is an inverse quality metric (lower PPL is higher quality)\n",
380
+ " # while Sim and Lex are direct quality metrics (higher is better).\n",
381
+ " # We add a small epsilon to avoid division by zero, though a PPL of 0 is practically impossible.\n",
382
+ " # PPL_i_inv = 1.0 / (PPL_i + 1e-6)\n",
383
+ " # -----IMPORTANT------\n",
384
+ " \n",
385
+ " KS_i = alpha * PPL_i + beta * Sim_i + gamma * Lex_i\n",
386
+ " \n",
387
+ " results.append({\n",
388
+ " 'src_lang': s_i,\n",
389
+ " 'tgt_lang': t_i,\n",
390
+ " 'PPL_i': PPL_i,\n",
391
+ " 'Sim_i': Sim_i,\n",
392
+ " 'Lex_i': Lex_i,\n",
393
+ " 'PPL_i': PPL_i,\n",
394
+ " 'KS_i': KS_i\n",
395
+ " })\n",
396
+ "\n",
397
+ " # Convert results to DataFrame for filtering\n",
398
+ " scored_corpus = pd.DataFrame(results)\n",
399
+ "\n",
400
+ " # 3. Determine Threshold and Filter (Lines 7-9)\n",
401
+ "    # Line 7: Find the configured percentile (percentile_threshold) of Knowledge Scores\n",
402
+ " tau_K = np.percentile(scored_corpus['KS_i'], percentile_threshold)\n",
403
+ "    print(f\"\\n{percentile_threshold}th Percentile Knowledge Score (τ_K): {tau_K:.4f}\")\n",
404
+ " \n",
405
+ " # Line 8: Filter the corpus\n",
406
+ " D_filtered = scored_corpus[scored_corpus['KS_i'] >= tau_K].copy()\n",
407
+ " \n",
408
+ " # Final cleanup of columns and return\n",
409
+ " D_filtered = D_filtered[['src_lang', 'tgt_lang', 'KS_i']]\n",
410
+ " print(f\"Raw corpus size: {len(raw_corpus)}\")\n",
411
+ " print(f\"Filtered corpus size (KS_i >= τ_K): {len(D_filtered)}\")\n",
412
+ " \n",
413
+ " return D_filtered"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": null,
419
+ "id": "2b2ce69d",
420
+ "metadata": {},
421
+ "outputs": [],
422
+ "source": [
423
+ "# --- Execution --- # script entry and high-level execution steps\n",
424
+ "# blank line preserved for readability\n",
425
+ "# Guard to ensure code only runs when executed as a script, not on import\n",
426
+ "if __name__ == '__main__':\n",
427
+ " # 1. Load data # load and preprocess corpus and dictionary files\n",
428
+ " raw_corpus, word_dictionary = load_data(CORPUS_FILE, DICT_FILE)\n",
429
+ " # blank line preserved for readability\n",
430
+ " # 2. Run the filtering algorithm # compute KS_i and filter by percentile\n",
431
+ " filtered_corpus = knowledge_based_filtering(\n",
432
+ " raw_corpus, # pass the preprocessed corpus DataFrame\n",
433
+ " word_dictionary, # pass the dictionary mapping\n",
434
+ " ALPHA, BETA, GAMMA, # weighting hyperparameters for KS_i\n",
435
+ " PERCENTILE_THRESHOLD # percentile cutoff for filtering\n",
436
+ " )\n",
437
+ " # blank line preserved for readability\n",
438
+ " # Save the filtered corpus to CSV for downstream use\n",
439
+ " filtered_corpus.to_csv(\"tgj_corpus_filtered_70th.csv\", index=False)\n",
440
+ " # blank line preserved for readability\n",
441
+ " # Notify user of completion and where results were saved\n",
442
+ " print(\"\\nFiltering complete. Results saved to tgj_corpus_filtered_70th.csv\")\n",
443
+ " # Show a short preview of the filtered corpus\n",
444
+ " print(\"\\nFiltered Corpus Head:\")\n",
445
+ " print(filtered_corpus) # print DataFrame to stdout"
446
+ ]
447
+ }
448
+ ],
449
+ "metadata": {
450
+ "kernelspec": {
451
+ "display_name": "ptorch",
452
+ "language": "python",
453
+ "name": "python3"
454
+ },
455
+ "language_info": {
456
+ "codemirror_mode": {
457
+ "name": "ipython",
458
+ "version": 3
459
+ },
460
+ "file_extension": ".py",
461
+ "mimetype": "text/x-python",
462
+ "name": "python",
463
+ "nbconvert_exporter": "python",
464
+ "pygments_lexer": "ipython3",
465
+ "version": "3.12.11"
466
+ }
467
+ },
468
+ "nbformat": 4,
469
+ "nbformat_minor": 5
470
+ }
scripts/batch_translation.ipynb ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2230ec1b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast # MBART model and tokenizer classes\n",
11
+ "from tqdm import tqdm # progress bar for loops\n",
12
+ "import torch # PyTorch for tensors and device handling\n",
13
+ "import csv # CSV writer for output"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "7552da07",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "# Load tokenizer and model (local path to your fine-tuned model)\n",
24
+ "model_path = \"./combined_training/en_tgj_combined_model\" # path to fine-tuned model directory (change if needed)\n",
25
+ "tokenizer = MBart50TokenizerFast.from_pretrained(model_path) # load tokenizer from model path\n",
26
+ "model = MBartForConditionalGeneration.from_pretrained(model_path) # load model weights and config\n",
27
+ "model.eval() # set model to evaluation mode (disables dropout)\n",
28
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # prefer GPU if available\n",
29
+ "model.to(device) # move model to selected device"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "750545cc",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Parameters for tokenization and generation\n",
40
+ "src_lang_token = \"en_XX\" # MBART source language token to prepend\n",
41
+ "tgt_lang_token = \"<tgn_IN>\" # target language token / forced BOS for generation\n",
42
+ "batch_size = 16 # number of sentences per batch\n",
43
+ "max_length = 128 # maximum token length for tokenization and generation"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "03415b52",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Read English sentences from a text file (one sentence per line)\n",
54
+ "with open(\"./sentences01.txt\", \"r\", encoding=\"utf-8\") as f: # input file path\n",
55
+ " english_sentences = [line.strip() for line in f if line.strip()] # strip and ignore empty lines"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "id": "c0927f85",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# Prepend the MBART source language token to each sentence\n",
66
+ "prefixed_sentences = [f\"{src_lang_token} {s}\" for s in english_sentences] # required by MBART tokenizer\n",
67
+ "\n",
68
+ "# Prepare a list to collect generated translations\n",
69
+ "translated_sentences = [] # will hold output strings"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "86b13113",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "# Iterate through sentences in batches and generate translations\n",
80
+ "for i in tqdm(range(0, len(prefixed_sentences), batch_size), desc=\"Batch Translating\"): # batching loop\n",
81
+ " batch = prefixed_sentences[i:i+batch_size] # take a slice for this batch\n",
82
+ "\n",
83
+ " # Tokenize the batch and move tensors to the model device\n",
84
+ " inputs = tokenizer(batch, return_tensors=\"pt\", padding=True, truncation=True, max_length=max_length).to(device)\n",
85
+ "\n",
86
+ " with torch.no_grad(): # disable gradients for inference to save memory\n",
87
+ " generated_tokens = model.generate(\n",
88
+ " **inputs, # pass input_ids, attention_mask, etc.\n",
89
+ " forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang_token), # ensure generation uses target language token\n",
90
+ " max_length=max_length, # cap the generated length\n",
91
+ " num_beams=5, # beam search for higher-quality decoding\n",
92
+ " early_stopping=True, # stop once beams finish\n",
93
+ " )\n",
94
+ "\n",
95
+ " # Decode token IDs to text and collect results\n",
96
+ " outputs = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) # convert ids to strings\n",
97
+ " translated_sentences.extend(outputs) # append batch outputs to final list"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "6d9a12d2",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "# Write aligned original and translated sentences to a CSV file\n",
108
+ "with open(\"./output_entgj_combined01.csv\", \"w\", encoding=\"utf-8\", newline=\"\") as f: # output file path\n",
109
+ " writer = csv.writer(f) # CSV writer object\n",
110
+ " writer.writerow([\"original\", \"translated\"]) # write header row\n",
111
+ " for src, tgt in zip(english_sentences, translated_sentences): # iterate aligned pairs\n",
112
+ " writer.writerow([src, tgt]) # write each pair as a row"
113
+ ]
114
+ }
115
+ ],
116
+ "metadata": {
117
+ "kernelspec": {
118
+ "display_name": "ptorch",
119
+ "language": "python",
120
+ "name": "python3"
121
+ },
122
+ "language_info": {
123
+ "codemirror_mode": {
124
+ "name": "ipython",
125
+ "version": 3
126
+ },
127
+ "file_extension": ".py",
128
+ "mimetype": "text/x-python",
129
+ "name": "python",
130
+ "nbconvert_exporter": "python",
131
+ "pygments_lexer": "ipython3",
132
+ "version": "3.12.11"
133
+ }
134
+ },
135
+ "nbformat": 4,
136
+ "nbformat_minor": 5
137
+ }
scripts/corpus_stats.ipynb ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "54834b8c",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd # import pandas and alias as pd for DataFrame operations\n",
11
+ "# blank line kept for readability\n",
12
+ "# Load your CSV # comment indicating next line loads the CSV into a DataFrame\n",
13
+ "df = pd.read_csv(\"filtered_corpus_here.csv\") # read the CSV file into variable df\n",
14
+ "# blank line kept for readability\n",
15
+ "# Normalize whitespace so \"hello  world\" and \"hello world\" match # explain normalization intent\n",
16
+ "df['src_norm'] = df['src_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True) # strip edges and collapse multiple spaces in source column\n",
17
+ "df['tgt_norm'] = df['tgt_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True) # same normalization for target column\n",
18
+ "# blank line kept for readability\n",
19
+ "# Drop duplicates based on combined src+tgt # remove identical source-target pairs after normalization\n",
20
+ "df_unique = df.drop_duplicates(subset=['src_norm', 'tgt_norm'], keep='first') # keep first occurrence of duplicate pairs\n",
21
+ "# blank line kept for readability\n",
22
+ "# Remove helper columns # drop intermediate normalization columns before saving\n",
23
+ "df_unique = df_unique.drop(columns=['src_norm', 'tgt_norm']) # remove the temporary normalized columns\n",
24
+ "# blank line kept for readability\n",
25
+ "# Save result # write the deduplicated DataFrame to a new CSV file\n",
26
+ "df_unique.to_csv(\"filtered_corpus_here_removedDuplicates.csv\", index=False) # save without row index\n",
27
+ "# blank line kept for readability\n",
28
+ "print(\"Done. Original rows:\", len(df)) # print the original number of rows\n",
29
+ "print(\"New rows:\", len(df_unique)) # print the number of rows after deduplication\n",
30
+ "print(\"Removed:\", len(df) - len(df_unique)) # print how many rows were removed\n",
31
+ "# end of cell"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "id": "e73f3d91",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "import pandas as pd # pandas for DataFrame operations\n",
42
+ "import numpy as np # numpy for numeric utilities (unused but commonly imported)\n",
43
+ "import string # string constants, used to build punctuation translator\n",
44
+ "import os # os module for file/path operations (imported for potential use)\n",
45
+ "# blank line for readability\n",
46
+ "# --- Configuration --- # configuration section start\n",
47
+ "# NOTE: This script assumes 'sample_corpus.csv' exists to generate the filtered file. # informational note\n",
48
+ "FILEPATH_RAW = 'filtered_corpus_here_removedDuplicates.csv' # path to the deduplicated raw corpus\n",
49
+ "FILEPATH_FILTERED = 'filtered_corpus_top_20.csv' # path to write/read the top filtered corpus\n",
50
+ "SOURCE_LANG_COL = 'src_lang' # column name for source language text\n",
51
+ "TARGET_LANG_COL = 'tgt_lang' # column name for target language text\n",
52
+ "SCORE_COL = 'KS_i' # column name that stores the Knowledge Score\n",
53
+ "# Filtering constants moved from calculate_threshold.py # note about origin of constant\n",
54
+ "TARGET_PERCENTILE = 80 # percentile threshold to filter top N% by score\n",
55
+ "OUTPUT_FILENAME = FILEPATH_FILTERED # Ensure output filename is the same # output target file variable\n",
56
+ "# --- End Configuration --- # configuration section end\n",
57
+ "# blank line for readability\n",
58
+ "def calculate_knowledge_threshold(filepath, percentile): # function to compute tau_K and filtered DF\n",
59
+ " \"\"\" # docstring start\n",
60
+ " Reads a CSV file, calculates the specified percentile of the Knowledge Score # description line 1\n",
61
+ " column, and returns the filtered corpus and the threshold (tau_K). # description line 2\n",
62
+ " \"\"\" # docstring end\n",
63
+ " try: # attempt to load and process the file\n",
64
+ " # 1. Load the data # step 1 comment\n",
65
+ " df = pd.read_csv(filepath) # load CSV into DataFrame df\n",
66
+ " # blank line for readability\n",
67
+ " # 1.5. Robust column check and numeric conversion # validate columns and convert types\n",
68
+ " if SCORE_COL not in df.columns: # check that score column exists\n",
69
+ " print(f\"Error: The CSV file must contain a column named '{SCORE_COL}'. Found columns: {list(df.columns)}\") # informative error\n",
70
+ " return pd.DataFrame(), None # return empty DF and None threshold on error\n",
71
+ " # blank line for readability\n",
72
+ " df_initial_size = len(df) # store initial row count for warnings\n",
73
+ " # Convert the score column to numeric, coercing errors # ensure numeric dtype for quantile\n",
74
+ " df[SCORE_COL] = pd.to_numeric(df[SCORE_COL], errors='coerce') # coerce invalids to NaN\n",
75
+ " df.dropna(subset=[SCORE_COL], inplace=True) # drop rows where score could not be parsed\n",
76
+ " # blank line for readability\n",
77
+ " if len(df) < df_initial_size: # warn if rows were dropped\n",
78
+ " print(f\"Warning: Dropped {df_initial_size - len(df)} rows with non-numeric scores.\") # warn about dropped rows\n",
79
+ " # blank line for readability\n",
80
+ " # 2. Calculate the threshold (tau_K) # compute percentile threshold\n",
81
+ " tau_K = df[SCORE_COL].quantile(percentile / 100, interpolation='linear') # compute numeric threshold\n",
82
+ " # blank line for readability\n",
83
+ " # 3. Apply the threshold to construct the filtered corpus (D_filtered) # filter rows >= tau_K\n",
84
+ " D_filtered = df[df[SCORE_COL] >= tau_K].copy() # select high-score rows and copy to new DF\n",
85
+ " # blank line for readability\n",
86
+ " return D_filtered, tau_K # return filtered DF and threshold\n",
87
+ " except FileNotFoundError: # handle missing file error\n",
88
+ " print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\") # print helpful message\n",
89
+ " return pd.DataFrame(), None # return empty DF and None threshold when file missing\n",
90
+ " except Exception as e: # catch-all for other errors\n",
91
+ " print(f\"An unexpected error occurred during filtering: {e}\") # print exception details\n",
92
+ " return pd.DataFrame(), None # return safe defaults on error\n",
93
+ "# blank line for readability\n",
94
+ "def tokenize_and_clean(text_series): # function to tokenize text series and clean tokens\n",
95
+ " \"\"\" # docstring start\n",
96
+ " Tokenizes text by splitting on whitespace, then cleans tokens by removing # description line 1\n",
97
+ " punctuation and converting to lowercase for accurate token counting and vocabulary size. # description line 2\n",
98
+ " \"\"\" # docstring end\n",
99
+ " all_tokens = [] # accumulator for all tokens across sentences\n",
100
+ " # blank line for readability\n",
101
+ " # Simple preprocessing: remove punctuation and lowercase # describe translator creation\n",
102
+ " translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) # map punctuation to spaces; NOTE(review): needs 'import string', which this notebook's visible import cell lacks -- confirm it is imported earlier\n",
103
+ " # blank line for readability\n",
104
+ " for text in text_series.astype(str): # iterate rows as strings\n",
105
+ " clean_text = text.translate(translator).lower() # remove punctuation and lowercase the text\n",
106
+ " tokens = clean_text.split() # split on whitespace into tokens\n",
107
+ " all_tokens.extend(tokens) # add tokens to accumulator\n",
108
+ " # loop continues for next sentence\n",
109
+ " return all_tokens # return the flattened token list\n",
110
+ "# blank line for readability\n",
111
+ "def calculate_corpus_metrics(filepath): # function to compute corpus-level metrics\n",
112
+ " \"\"\"Calculates all required corpus statistics for a given file.\"\"\" # single-line docstring\n",
113
+ " try: # try to read the file into a DataFrame\n",
114
+ " df = pd.read_csv(filepath) # load corpus file\n",
115
+ " except FileNotFoundError: # handle missing file\n",
116
+ " print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\") # user-friendly message\n",
117
+ " return None, None # return a 2-tuple so the caller's 'pairs, metrics = ...' unpacking does not raise TypeError\n",
118
+ " except pd.errors.EmptyDataError: # handle empty file error\n",
119
+ " print(f\"Error: The file '{filepath}' is empty.\") # inform user file has no data\n",
120
+ " return None, None # return a 2-tuple so the caller's 'pairs, metrics = ...' unpacking does not raise TypeError\n",
121
+ " # blank line for readability\n",
122
+ " # 1. Metric: Sentence Pairs # compute total number of sentence pairs\n",
123
+ " sentence_pairs = len(df) # number of rows equals sentence pairs\n",
124
+ " # blank line for readability\n",
125
+ " # 2. Metric: Tokens, Avg. Sentence Length, Vocabulary Size # prepare metrics container\n",
126
+ " metrics = {} # dict to hold source/target metrics\n",
127
+ " # blank line for readability\n",
128
+ " # Ensure column existence before proceeding # validate presence of expected columns\n",
129
+ " for col in [SOURCE_LANG_COL, TARGET_LANG_COL]: # iterate expected column names\n",
130
+ " if col not in df.columns: # raise if missing\n",
131
+ " raise KeyError(f\"Column '{col}' not found in the corpus file.\") # explicit error to surface missing columns\n",
132
+ " # blank line for readability\n",
133
+ " for col, tag in [(SOURCE_LANG_COL, 'English'), (TARGET_LANG_COL, 'Target')]: # compute metrics for both sides\n",
134
+ " # Tokenization and Cleaning # comment for next steps\n",
135
+ " tokens = tokenize_and_clean(df[col]) # get token list for this column\n",
136
+ " total_tokens = len(tokens) # total token count across all sentences\n",
137
+ " # blank line for readability\n",
138
+ " # Vocabulary Size # compute unique token count\n",
139
+ " vocab_size = len(set(tokens)) # size of unique token set\n",
140
+ " # blank line for readability\n",
141
+ " # Avg. Sentence Length (Tokens / Sentence Pairs) # compute average tokens per sentence\n",
142
+ " avg_sentence_length = total_tokens / sentence_pairs if sentence_pairs > 0 else 0 # guard division by zero\n",
143
+ " # blank line for readability\n",
144
+ " metrics[tag] = { # store computed metrics under tag key\n",
145
+ " 'tokens': total_tokens, # total tokens count\n",
146
+ " 'avg_len': avg_sentence_length, # average sentence length in tokens\n",
147
+ " 'vocab_size': vocab_size # vocabulary size after preprocessing\n",
148
+ " }\n",
149
+ " return sentence_pairs, metrics # return computed metrics\n",
150
+ "# blank line for readability\n",
151
+ "def format_and_print_results(sentence_pairs, metrics): # pretty-print the metrics in table form\n",
152
+ " \"\"\"Formats the calculated metrics into the requested table structure.\"\"\" # docstring\n",
153
+ " # blank line for readability\n",
154
+ " src_data = metrics['English'] # metrics for source/English side\n",
155
+ " tgt_data = metrics['Target'] # metrics for target side\n",
156
+ " # blank line for readability\n",
157
+ " print(\"\\n\" + \"=\"*80) # print top border\n",
158
+ " print(f\" FILTERED CORPUS METRICS ({FILEPATH_FILTERED})\") # title with filename\n",
159
+ " print(\"=\"*80) # print border again\n",
160
+ " # blank line for readability\n",
161
+ " # Table Header # print header row labels\n",
162
+ " print(f\"| {'Metric':<30} | {'Source (English)':>20} | {'Target (Target)':>20} | {'Notes':<10} |\") # header formatting\n",
163
+ " print(\"-\" * 80) # separator line\n",
164
+ " # blank line for readability\n",
165
+ " # Row: Sentence Pairs # print sentence pair counts\n",
166
+ " print(f\"| {'Sentence Pairs':<30} | {sentence_pairs:>20,} | {sentence_pairs:>20,} | {'--':<10} |\") # counts for both columns\n",
167
+ " # blank line for readability\n",
168
+ " # Row: Tokens (Formatted for M/K display based on size) # prepare token display strings\n",
169
+ " src_tokens_display = f\"{src_data['tokens']:,}\" # formatted source token count\n",
170
+ " tgt_tokens_display = f\"{tgt_data['tokens']:,}\" # formatted target token count\n",
171
+ " # blank line for readability\n",
172
+ " print(f\"| {'Tokens':<30} | {src_tokens_display:>20} | {tgt_tokens_display:>20} | {'Actual Count':<10} |\") # print tokens row\n",
173
+ " # blank line for readability\n",
174
+ " # Row: Avg. Sentence Length # print average sentence lengths\n",
175
+ " print(f\"| {'Avg. Sentence Length':<30} | {src_data['avg_len']:>20.2f} | {tgt_data['avg_len']:>20.2f} | {'Tokens/Pair':<10} |\") # two-decimal precision\n",
176
+ " # blank line for readability\n",
177
+ " # Row: Vocabulary Size # print vocabulary sizes\n",
178
+ " print(f\"| {'Vocabulary Size':<30} | {src_data['vocab_size']:>20,} | {tgt_data['vocab_size']:>20,} | {'After Preprocessing':<10} |\") # vocab counts\n",
179
+ " # blank line for readability\n",
180
+ " # Row: OOV Rate (Placeholder since test set is needed) # OOV requires a test set\n",
181
+ " print(f\"| {'OOV Rate':<30} | {'--':>20} | {'--':>20} | {'Requires Test Set':<10} |\") # placeholder output\n",
182
+ " # blank line for readability\n",
183
+ " print(\"-\" * 80) # bottom separator\n",
184
+ "# blank line for readability\n",
185
+ "# blank line for readability\n",
186
+ "if __name__ == \"__main__\": # script entrypoint guard\n",
187
+ " # blank line for readability\n",
188
+ " print(\"NOTE: Running filtering logic to ensure 'filtered_corpus_top_20.csv' is up-to-date for metrics.\") # informational message\n",
189
+ " # blank line for readability\n",
190
+ " # 1. Run the filtering logic (now self-contained) # compute filtered DF and threshold\n",
191
+ " filtered_df, threshold = calculate_knowledge_threshold(FILEPATH_RAW, TARGET_PERCENTILE) # call to compute top percentile\n",
192
+ " # blank line for readability\n",
193
+ " if filtered_df.empty: # check if filtering produced results\n",
194
+ " print(\"Could not generate filtered corpus. Cannot proceed with metrics calculation.\") # failure message\n",
195
+ " else: # if we have filtered data, continue\n",
196
+ " # 2. Save the filtered corpus (important for the metrics script to read it) # save step\n",
197
+ " filtered_df.to_csv(OUTPUT_FILENAME, index=False) # write filtered DF to output file\n",
198
+ " print(f\"Filtered corpus saved to '{OUTPUT_FILENAME}'.\") # confirmation message\n",
199
+ " # blank line for readability\n",
200
+ " # 3. Now run the actual metrics calculation on the saved filtered file # compute metrics\n",
201
+ " sentence_pairs, metrics = calculate_corpus_metrics(FILEPATH_FILTERED) # call metrics function\n",
202
+ " # blank line for readability\n",
203
+ " if metrics: # if metrics computed successfully\n",
204
+ " format_and_print_results(sentence_pairs, metrics) # print the formatted table\n",
205
+ " else: # metrics call failed\n",
206
+ " print(\"Metrics calculation failed.\") # failure message\n",
207
+ "# end of cell"
208
+ ]
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "ptorch",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.12.11"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
scripts/finetuning_script.ipynb ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "b58b67f3",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# Import model and training classes from Hugging Face Transformers\n",
11
+ "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
12
+ "# Import dataset utilities from the datasets library\n",
13
+ "from datasets import load_dataset, Dataset\n",
14
+ "# Import pandas for CSV/DF handling\n",
15
+ "import pandas as pd\n",
16
+ "# Import torch for device checks and tensors\n",
17
+ "import torch\n",
18
+ "# Import evaluate to load evaluation metrics\n",
19
+ "import evaluate\n",
20
+ "# Import numpy for numeric array manipulation\n",
21
+ "import numpy as np"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "349f59d3",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# Path to the main corpus CSV file (expects two columns: source and target)\n",
32
+ "data_path = \"./your/main/corpus.csv\" # Two columns: 'en' and 't'\n",
33
+ "# Read the CSV into a pandas DataFrame\n",
34
+ "df = pd.read_csv(data_path)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "f9232b43",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "# Display the first 3 rows of the DataFrame to inspect loaded data\n",
45
+ "df.head(3)"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "88f11bdd",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Ensure the dataframe has correct column names and no list-type values\n",
56
+ "def ensure_text_columns(df):\n",
57
+ " # If the DataFrame uses 'Src_lang'/'Tgt_lang', rename them to 'src'/'tgt'\n",
58
+ " if 'Src_lang' in df.columns and 'Tgt_lang' in df.columns:\n",
59
+ " df = df.rename(columns={\"Src_lang\": \"src\", \"Tgt_lang\": \"tgt\"})\n",
60
+ " # blank line preserved for readability\n",
61
+ " # Ensure all values are strings to avoid list/object types during tokenization\n",
62
+ " df['src'] = df['src'].astype(str)\n",
63
+ " df['tgt'] = df['tgt'].astype(str)\n",
64
+ " # blank line preserved for readability\n",
65
+ " return df # return the normalized DataFrame\n",
66
+ "# Apply the helper to the loaded DataFrame\n",
67
+ "df = ensure_text_columns(df)"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "id": "9c3fdfa7",
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "# Re-inspect the DataFrame after normalization\n",
78
+ "df.head(3)"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "3aaec3a8",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "# Add language prefix tokens that will be prepended to source/target sentences\n",
89
+ "prefix_src = \"src_lang_code\" # placeholder source language token\n",
90
+ "prefix_tgt = \"tgt_lang_code\" # placeholder target language token"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "id": "06bbfc98",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Preprocessing function that adds language prefix tokens to each example\n",
101
+ "def preprocess(example):\n",
102
+ " # change the prefix_src and prefix_tgt to change the translation direction\n",
103
+ " return {\n",
104
+ " \"translation\": {\n",
105
+ " \"src\": f\"{prefix_src} {example['src']}\", # prepend source prefix\n",
106
+ " \"tgt\": f\"{prefix_tgt} {example['tgt']}\" # prepend target prefix\n",
107
+ " }\n",
108
+ " }"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "ad52beb7",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "# Rename columns (no-op here but kept for clarity) and apply preprocessing to create a Dataset\n",
119
+ "df = df.rename(columns={\"src\": \"src\", \"tgt\": \"tgt\"}) # explicit rename placeholder\n",
120
+ "# Convert pandas DataFrame to a Hugging Face Dataset\n",
121
+ "dataset = Dataset.from_pandas(df)\n",
122
+ "# Apply preprocessing function to each example in the dataset\n",
123
+ "dataset = dataset.map(preprocess)"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "fd30423f",
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "# Split the Dataset into training and validation sets\n",
134
+ "split_dataset = dataset.train_test_split(test_size=0.1, seed=42) # 10% for validation\n",
135
+ "# Extract train and validation Dataset objects\n",
136
+ "train_data = split_dataset[\"train\"]\n",
137
+ "val_data = split_dataset[\"test\"]"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "951b1f86",
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "# Save processed train/validation splits to CSV files for later use\n",
148
+ "train_data.to_csv(\"train_set.csv\", index=False) # write training set\n",
149
+ "val_data.to_csv(\"val_set.csv\", index=False) # write validation set\n",
150
+ "print(\"Train and validation data saved successfully:\") # confirmation\n",
151
+ "print(f\"Train size: {len(train_data)}\") # show train count\n",
152
+ "print(f\"Validation size: {len(val_data)}\") # show validation count"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "id": "3cbc12e7",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "# Load the MBART-50 tokenizer (fast implementation)\n",
163
+ "tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
164
+ "# Add any custom special tokens required (e.g., our target language code)\n",
165
+ "tokenizer.add_special_tokens({'additional_special_tokens': [\"tgt_lang_code\"]})\n",
166
+ "# Register the new lang token in the tokenizer's lang_code mapping\n",
167
+ "tokenizer.lang_code_to_id[\"tgt_lang_code\"] = len(tokenizer.lang_code_to_id)\n",
168
+ "# Rebuild reverse mapping from id to lang code (useful later)\n",
169
+ "tokenizer.id_to_lang_code = {v: k for k, v in tokenizer.lang_code_to_id.items()}"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "id": "fc507095",
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "# Load the pretrained MBART model for conditional generation\n",
180
+ "model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
181
+ "# Resize model token embeddings to account for any newly added tokens in the tokenizer\n",
182
+ "model.resize_token_embeddings(len(tokenizer))"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "id": "1ebd78be",
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "# --- Step 4: Tokenize data --- # tokenization settings and helper\n",
193
+ "# Maximum tokenization length for inputs/targets\n",
194
+ "max_length = 128\n",
195
+ "# blank line for readability\n",
196
+ "# Tokenization function applied to dataset examples\n",
197
+ "def tokenize_function(examples):\n",
198
+ " # Tokenize source with padding/truncation to max_length\n",
199
+ " inputs = tokenizer(examples[\"translation\"][\"src\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
200
+ " # Tokenize target similarly\n",
201
+ " targets = tokenizer(examples[\"translation\"][\"tgt\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
202
+ " # Use tokenized target input_ids as labels for seq2seq training\n",
203
+ " inputs[\"labels\"] = targets[\"input_ids\"]\n",
204
+ " return inputs"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "bb922c07",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "# Tokenize the dataset using the helper function defined above\n",
215
+ "train_dataset = train_data.map(tokenize_function)\n",
216
+ "val_dataset = val_data.map(tokenize_function)"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "213966dc",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "# Import evaluation utilities (repeated import is safe inside notebook but already imported above)\n",
227
+ "import evaluate\n",
228
+ "# numpy imported earlier; this duplicate import is harmless\n",
229
+ "import numpy as np\n",
230
+ "# blank line for readability\n",
231
+ "# Load metric implementations once to reuse inside compute_metrics\n",
232
+ "bleu_metric = evaluate.load(\"bleu\")\n",
233
+ "meteor_metric = evaluate.load(\"meteor\")\n",
234
+ "ter_metric = evaluate.load(\"ter\")\n",
235
+ "chrf_metric = evaluate.load(\"chrf\")\n",
236
+ "# blank line for readability\n",
237
+ "# Function used by Trainer to compute evaluation metrics from model outputs\n",
238
+ "def compute_metrics(eval_preds):\n",
239
+ " # eval_preds is a tuple (predictions, labels)\n",
240
+ " preds, labels = eval_preds\n",
241
+ " # blank line for readability\n",
242
+ " # Decode predictions from token ids to strings\n",
243
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
244
+ " # blank line for readability\n",
245
+ " # Replace masked label tokens (-100) with pad token id so they decode properly\n",
246
+ " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
247
+ " # blank line for readability\n",
248
+ " # Decode label ids to strings\n",
249
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
250
+ " # blank line for readability\n",
251
+ " # Clean whitespace from decoded strings\n",
252
+ " decoded_preds = [p.strip() for p in decoded_preds]\n",
253
+ " decoded_labels = [[l.strip()] for l in decoded_labels] # convert to list-of-lists for metrics\n",
254
+ " # blank line for readability\n",
255
+ " # Compute each metric using the decoded predictions and references\n",
256
+ " bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
257
+ " meteor = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
258
+ " ter = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
259
+ " chrf = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
260
+ " # blank line for readability\n",
261
+ " # BLEU implementations may return different keys; try common ones\n",
262
+ " bleu_score = bleu.get(\"score\", bleu.get(\"bleu\"))\n",
263
+ " # blank line for readability\n",
264
+ " return {\n",
265
+ " \"ChrF\": chrf[\"score\"], # MAIN METRIC\n",
266
+ " \"BLEU\": bleu_score,\n",
267
+ " \"METEOR\": meteor[\"meteor\"],\n",
268
+ " \"TER\": ter[\"score\"]\n",
269
+ " }\n",
270
+ "# end of cell"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "id": "824252e3",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "# Configure Seq2Seq training arguments for the Hugging Face Trainer\n",
281
+ "training_args = Seq2SeqTrainingArguments(\n",
282
+ " output_dir=\"./your/model/checkpoints\", # directory for checkpoints and outputs\n",
283
+ " per_device_train_batch_size=8, # batch size per device for training\n",
284
+ " per_device_eval_batch_size=8, # batch size per device for evaluation\n",
285
+ " gradient_accumulation_steps=4, # effective batch size = 8*4 = 32\n",
286
+ " learning_rate=3e-5, # initial learning rate\n",
287
+ " weight_decay=0.01, # weight decay for optimizer\n",
288
+ " num_train_epochs=3, # number of training epochs\n",
289
+ " warmup_steps=1000, # number of warmup steps for scheduler\n",
290
+ " lr_scheduler_type=\"cosine\", # learning rate scheduler type\n",
291
+ " fp16=torch.cuda.is_available(), # enable fp16 if CUDA is available\n",
292
+ " eval_strategy=\"steps\", # evaluate every X steps (renamed from 'evaluation_strategy', removed in transformers>=4.46 where 'processing_class' below was introduced)\n",
293
+ " eval_steps=2000, # evaluation interval in steps\n",
294
+ " save_strategy=\"steps\", # save checkpoints every X steps\n",
295
+ " save_steps=2000, # checkpoint saving interval\n",
296
+ " load_best_model_at_end=True, # keep the best model according to metric\n",
297
+ " metric_for_best_model=\"ChrF\", # metric used to select best model\n",
298
+ " greater_is_better=True, # higher metric value is better\n",
299
+ " save_total_limit=5, # limit number of saved checkpoints\n",
300
+ " predict_with_generate=True, # use generate() for predictions during eval\n",
301
+ " generation_max_length=128, # max length when generating predictions\n",
302
+ " generation_num_beams=4, # number of beams for generation\n",
303
+ " logging_dir=\"./logs\", # tensorboard/logging dir\n",
304
+ " logging_steps=200, # logging interval\n",
305
+ " seed=42, # random seed for reproducibility\n",
306
+ " report_to=\"none\", # disable reporting to external services\n",
307
+ ")\n",
308
+ "# end of training_args cell"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": null,
314
+ "id": "dc6cb1bd",
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "# Import a data collator that pads to longest sequence in the batch for seq2seq models\n",
319
+ "from transformers import DataCollatorForSeq2Seq\n",
320
+ "# blank line for readability\n",
321
+ "# Create the data collator which will dynamically pad batch examples\n",
322
+ "data_collator = DataCollatorForSeq2Seq(\n",
323
+ " tokenizer,\n",
324
+ " model=model,\n",
325
+ " padding=\"longest\", # pad to the longest sequence in the batch\n",
326
+ ")\n",
327
+ "# end of data_collator cell"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "id": "9508fc22",
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "# Create the Seq2SeqTrainer wrapper which handles training/evaluation loops\n",
338
+ "trainer = Seq2SeqTrainer(\n",
339
+ " model=model, # the model to train\n",
340
+ " args=training_args, # training configuration\n",
341
+ " train_dataset=train_dataset, # training data\n",
342
+ " eval_dataset=val_dataset, # evaluation data\n",
343
+ " processing_class=tokenizer, # tokenizer/processor used for the model\n",
344
+ " data_collator=data_collator, # handles padding in batches\n",
345
+ " compute_metrics=compute_metrics, # metrics callback for evaluation\n",
346
+ ")\n",
347
+ "# end of trainer creation cell"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "d5aea8ad",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "# Start training. This runs the main training loop according to training_args\n",
358
+ "trainer.train()"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": null,
364
+ "id": "462c0da0",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "# Evaluate the trained model on the validation set and print returned metrics\n",
369
+ "eval_results = trainer.evaluate()\n",
370
+ "print(eval_results)"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": null,
376
+ "id": "59cfe87b",
377
+ "metadata": {},
378
+ "outputs": [],
379
+ "source": [
380
+ "# Show eval_results variable (already printed above) in a notebook cell to display its value\n",
381
+ "eval_results"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": null,
387
+ "id": "d629dc4b",
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "# Save the fine-tuned model weights and tokenizer to a directory\n",
392
+ "model.save_pretrained(\"./your/model/name\") # saves model config and weights\n",
393
+ "tokenizer.save_pretrained(\"./your/model/name\") # saves tokenizer files"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": null,
399
+ "id": "0641551e",
400
+ "metadata": {},
401
+ "outputs": [],
402
+ "source": [
403
+ "# Load pipeline utilities for quick inference\n",
404
+ "import torch\n",
405
+ "from transformers import pipeline\n",
406
+ "# blank line for readability\n",
407
+ "# Create a translation pipeline pointing at the saved model directory\n",
408
+ "pipeline = pipeline(\n",
409
+ " task=\"translation\", # pipeline task\n",
410
+ " model=\"./your/model/name\", # path to saved model\n",
411
+ " device=0, # device id (0 for first GPU); set to -1 for CPU\n",
412
+ " torch_dtype=torch.float16, # use float16 if model and device support it\n",
413
+ " src_lang=\"src_lang_code\", # source language code token\n",
414
+ " tgt_lang=\"tgt_lang_code\", # target language code token\n",
415
+ ")\n",
416
+ "# Run the pipeline on a sample sentence and print the translation\n",
417
+ "print(pipeline(\"I like singing\"))"
418
+ ]
419
+ }
420
+ ],
421
+ "metadata": {
422
+ "kernelspec": {
423
+ "display_name": "ptorch",
424
+ "language": "python",
425
+ "name": "python3"
426
+ },
427
+ "language_info": {
428
+ "codemirror_mode": {
429
+ "name": "ipython",
430
+ "version": 3
431
+ },
432
+ "file_extension": ".py",
433
+ "mimetype": "text/x-python",
434
+ "name": "python",
435
+ "nbconvert_exporter": "python",
436
+ "pygments_lexer": "ipython3",
437
+ "version": "3.12.11"
438
+ }
439
+ },
440
+ "nbformat": 4,
441
+ "nbformat_minor": 5
442
+ }
scripts/intrinsic_evaluation.ipynb ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "cbf05abe",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import string\n",
13
+ "import re\n",
14
+ "\n",
15
+ "# --- Configuration ---\n",
16
+ "FILEPATH_RAW = 'Path/to/the/original/corpus.csv'\n",
17
+ "FILEPATH_FILTERED = 'Path/to/the/pre-filtered/corpus.csv' # Path to the pre-filtered corpus\n",
18
+ "SCORE_COL = 'KS_i'\n",
19
+ "# --- End Configuration ---\n",
20
+ "\n",
21
def tokenize_and_clean(text_series):
    """Lowercase, strip punctuation, and whitespace-tokenize a text column.

    Every entry of *text_series* is coerced to ``str`` and joined into one
    string; all ASCII punctuation is removed in a single ``str.translate``
    pass, and the result is split on whitespace.

    Returns a (possibly empty) list of tokens.
    """
    # Translation table built once: maps every punctuation char to None.
    punct_table = str.maketrans('', '', string.punctuation)
    joined = " ".join(text_series.astype(str))
    return joined.lower().translate(punct_table).split()
30
+ "\n",
31
def calculate_ttr(tokens):
    """Return the type-token ratio (unique tokens / total tokens).

    An empty token list yields 0.0 rather than raising ZeroDivisionError.
    """
    total = len(tokens)
    return len(set(tokens)) / total if total else 0.0
37
+ "\n",
38
def intrinsic_evaluation(filepath_raw, filepath_filtered, score_col):
    """Compare a filtered parallel corpus against its raw counterpart.

    Prints corpus sizes and retention rate, the mean knowledge score
    (*score_col*) of each corpus, and the type-token ratio (lexical
    diversity) of the ``src_lang`` and ``tgt_lang`` columns.

    Parameters
    ----------
    filepath_raw : str
        Path to the original (unfiltered) corpus CSV.
    filepath_filtered : str
        Path to the pre-filtered corpus CSV.
    score_col : str
        Name of the numeric knowledge-score column (e.g. ``"KS_i"``).

    Both CSVs must contain ``src_lang``, ``tgt_lang`` and *score_col*
    columns. All results go to stdout; nothing is returned.
    """
    try:
        # 1. Load both corpora.
        df_raw = pd.read_csv(filepath_raw)
        df_filtered = pd.read_csv(filepath_filtered)

        # 1.5. Coerce scores to numeric and drop rows missing score or text.
        for df, name in [(df_raw, "Raw"), (df_filtered, "Filtered")]:
            if score_col not in df.columns:
                print(f"Error: The {name} CSV file must contain a column named '{score_col}'.")
                return
            df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
            df.dropna(subset=[score_col, 'src_lang', 'tgt_lang'], inplace=True)

        if len(df_raw) == 0 or len(df_filtered) == 0:
            print("Error: One or both corpora are empty after cleaning.")
            return

        # 2. Calculate intrinsic metrics.

        # --- A. Knowledge score averages ---
        raw_avg_ks = df_raw[score_col].mean()
        filtered_avg_ks = df_filtered[score_col].mean()

        # --- B. Lexical diversity (type-token ratio) ---
        raw_src_ttr = calculate_ttr(tokenize_and_clean(df_raw['src_lang']))
        raw_tgt_ttr = calculate_ttr(tokenize_and_clean(df_raw['tgt_lang']))
        filtered_src_ttr = calculate_ttr(tokenize_and_clean(df_filtered['src_lang']))
        filtered_tgt_ttr = calculate_ttr(tokenize_and_clean(df_filtered['tgt_lang']))

        # 3. Print results.
        print("\n" + "=" * 60)
        print("INTRINSIC CORPUS QUALITY EVALUATION")
        print("=" * 60)
        retention_rate = len(df_filtered) / len(df_raw) * 100
        print(f"Corpus Sizes: Raw={len(df_raw)} | Filtered={len(df_filtered)} (Retention: {retention_rate:.2f}%)")

        # --- Average KS_i comparison ---
        print("\n--- 1. Average Knowledge Score (KS_i) ---")
        print(f"| {'Metric':<25} | {'Raw Corpus':>15} | {'Filtered Corpus':>15} |")
        print(f"| {'Average KS_i':<25} | {raw_avg_ks:>15.4f} | {filtered_avg_ks:>15.4f} |")

        # FIX: guard the relative-change computation — a zero raw mean used to
        # raise ZeroDivisionError, silently swallowed by the broad except below.
        if raw_avg_ks != 0:
            ks_increase_percent = ((filtered_avg_ks - raw_avg_ks) / raw_avg_ks) * 100
            print("\n**Conclusion:** The Average KS_i increased by {0:.2f}% after filtering. The increase in mean score confirms that the KS_i metric successfully concentrated high-quality data.".format(ks_increase_percent))
        else:
            print("\n**Conclusion:** Raw average KS_i is 0; the relative change is undefined.")

        # --- TTR comparison ---
        print("\n--- 2. Lexical Diversity (Type-Token Ratio) ---")
        print(f"| {'Metric':<25} | {'Raw Corpus':>15} | {'Filtered Corpus':>15} |")
        print(f"| {'Source (src_lang) TTR':<25} | {raw_src_ttr:>15.4f} | {filtered_src_ttr:>15.4f} |")
        print(f"| {'Target (tgt_lang) TTR':<25} | {raw_tgt_ttr:>15.4f} | {filtered_tgt_ttr:>15.4f} |")

        # FIX: same guard for the TTR ratios (a raw TTR of 0 is possible when
        # a text column cleans down to no tokens).
        if raw_src_ttr and raw_tgt_ttr:
            src_ttr_change = (filtered_src_ttr - raw_src_ttr) / raw_src_ttr * 100
            tgt_ttr_change = (filtered_tgt_ttr - raw_tgt_ttr) / raw_tgt_ttr * 100
            print(f"\n**Conclusion:** Diversity (TTR) changed by {src_ttr_change:.2f}% (Source) and {tgt_ttr_change:.2f}% (Target).")
            print("A positive or minimal negative change in TTR suggests the filter successfully isolated quality data without sacrificing vital vocabulary coverage.")
        else:
            print("\n**Conclusion:** Raw TTR is 0 for at least one side; the relative change is undefined.")

    except FileNotFoundError as e:
        print(f"Error: A required file was not found: {e}. Ensure both '{filepath_raw}' and '{filepath_filtered}' exist.")
    except Exception as e:
        # Broad catch kept deliberately: this is a reporting script that should
        # surface (not crash on) malformed input; the error is still printed.
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    intrinsic_evaluation(FILEPATH_RAW, FILEPATH_FILTERED, SCORE_COL)
117
+ ]
118
+ }
119
+ ],
120
+ "metadata": {
121
+ "kernelspec": {
122
+ "display_name": "ptorch",
123
+ "language": "python",
124
+ "name": "python3"
125
+ },
126
+ "language_info": {
127
+ "codemirror_mode": {
128
+ "name": "ipython",
129
+ "version": 3
130
+ },
131
+ "file_extension": ".py",
132
+ "mimetype": "text/x-python",
133
+ "name": "python",
134
+ "nbconvert_exporter": "python",
135
+ "pygments_lexer": "ipython3",
136
+ "version": "3.12.11"
137
+ }
138
+ },
139
+ "nbformat": 4,
140
+ "nbformat_minor": 5
141
+ }