{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "311e31e2", "metadata": {}, "outputs": [], "source": [ "# Import pandas for DataFrame manipulation\n", "import pandas as pd\n", "# Import numpy for numerical operations\n", "import numpy as np\n", "# Import torch for tensor operations and device handling\n", "import torch\n", "# Import MBART model and tokenizer from Hugging Face Transformers\n", "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n", "# Import cosine similarity for comparing embeddings\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "# Import tqdm to show progress bars for loops\n", "from tqdm import tqdm\n", "# Import regex utilities for tokenization and cleaning\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "id": "3363ac62", "metadata": {}, "outputs": [], "source": [ "# --- Configuration ---\n", "MODEL_NAME = \"your/model/name\"\n", "SRC_LANG_CODE = \"src_lang_code\"\n", "TGT_LANG_CODE = \"tgt_lang_code\"\n", "CORPUS_FILE = \"your/corpus/here.csv\"\n", "DICT_FILE = \"your/bilingual/dictionary/here.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "d5d67ae6", "metadata": {}, "outputs": [], "source": [ "# Hyperparameters for the Knowledge Score (KS_i)\n", "# You would tune these based on empirical performance\n", "ALPHA = 0.1\n", "BETA = 0.3\n", "GAMMA = 0.6\n", "PERCENTILE_THRESHOLD = 70 # Filter threshold: keep pairs above this percentile" ] }, { "cell_type": "code", "execution_count": null, "id": "f5fb7924", "metadata": {}, "outputs": [], "source": [ "def preprocess_text(text):\n", " \"\"\"\n", " Safely preprocesses text by handling NaN, non-string values,\n", " and performing normalization steps.\n", " \"\"\"\n", " if not isinstance(text, str):\n", " return \"\"\n", " text = text.strip().lower()\n", " text = re.sub(r\"\\s+\", \" \", text) # Collapse multiple spaces\n", " text = re.sub(r\"[^a-zA-Z0-9\\s']\", \"\", text) # Remove unwanted symbols (keep alphanumerics and apostrophes)\n", " return text\n", "\n", "\n", "def load_data(corpus_file, dict_file):\n", " \"\"\"Loads, cleans, and prepares the parallel corpus and bilingual dictionary.\"\"\"\n", "\n", " # --- Load the CSVs safely ---\n", " try:\n", " raw_corpus = pd.read_csv(corpus_file)\n", " word_dictionary = pd.read_csv(dict_file)\n", " except Exception as e:\n", " raise ValueError(f\"Error loading files: {e}\")\n", "\n", " # --- Ensure expected columns exist ---\n", " required_corpus_cols = {'English', 'Tagin'}\n", " required_dict_cols = {'English', 'Tagin'}\n", "\n", " if not required_corpus_cols.issubset(raw_corpus.columns):\n", " raise ValueError(f\"Corpus file must contain columns: {required_corpus_cols}\")\n", " if not required_dict_cols.issubset(word_dictionary.columns):\n", " raise ValueError(f\"Dictionary file must contain columns: {required_dict_cols}\")\n", "\n", " # --- Drop rows with all NaN values ---\n", " raw_corpus = raw_corpus.dropna(how='all')\n", "\n", " # --- Fill NaN cells with empty strings ---\n", " raw_corpus = raw_corpus.fillna(\"\")\n", "\n", " # --- Apply text preprocessing ---\n", " raw_corpus[\"English\"] = raw_corpus[\"English\"].apply(preprocess_text)\n", " raw_corpus[\"Tagin\"] = raw_corpus[\"Tagin\"].apply(preprocess_text)\n", "\n", " # --- Clean dictionary entries ---\n", " word_dictionary[\"English\"] = word_dictionary[\"English\"].apply(preprocess_text)\n", " word_dictionary[\"Tagin\"] = word_dictionary[\"Tagin\"].apply(preprocess_text)\n", "\n", " # --- Convert dictionary to mapping ---\n", " word_dictionary = word_dictionary.set_index('English')['Tagin'].to_dict()\n", "\n", " # --- Remove empty rows after cleaning ---\n", " raw_corpus = raw_corpus[\n", " (raw_corpus[\"English\"].str.strip() != \"\") &\n", " (raw_corpus[\"Tagin\"].str.strip() != \"\")\n", " ].reset_index(drop=True)\n", "\n", " print(f\"Loaded {len(raw_corpus)} sentence pairs and {len(word_dictionary)} dictionary entries.\")\n", " return raw_corpus, word_dictionary" ] }, { "cell_type": "code", "execution_count": null, "id": "772824d1", "metadata": {}, "outputs": [], "source": [ "load_data(CORPUS_FILE,DICT_FILE)" ] }, { "cell_type": "code", "execution_count": null, "id": "7322656f", "metadata": {}, "outputs": [], "source": [ "# Function for Step 2: Perplexity (PPL)\n", "@torch.no_grad()\n", "def calculate_perplexity(sentence, model, tokenizer, device):\n", " \"\"\"Computes perplexity of a sentence using the given LM.\"\"\"\n", " try:\n", " # Tokenize and format for mBART-50 (e.g., [lang_code] X [eos])\n", " # We'll treat this as a generation task from the source language to itself\n", " # to get log probabilities for the language modeling loss.\n", " input_ids = tokenizer(\n", " sentence,\n", " return_tensors=\"pt\",\n", " max_length=512,\n", " truncation=True\n", " ).input_ids.to(device)\n", " \n", " # Set the source language\n", " tokenizer.src_lang = SRC_LANG_CODE\n", " \n", " # The labels for perplexity are the input tokens themselves, shifted.\n", " # This is essentially a language modeling task.\n", " labels = input_ids.clone()\n", " \n", " # Use -100 to ignore the loss for special tokens (like the language code token)\n", " labels[:, 0] = -100\n", "\n", " outputs = model(input_ids=input_ids, labels=labels)\n", " neg_log_likelihood = outputs.loss\n", " \n", " # Perplexity is exp(average negative log-likelihood)\n", " # The 'outputs.loss' from the Transformers library is already the average NLL per token.\n", " ppl = torch.exp(neg_log_likelihood).item()\n", " return ppl\n", " except Exception as e:\n", " print(f\"Error calculating PPL for: '{sentence}'. Error: {e}\")\n", " return float('inf') # Return a very high PPL for errors/bad sentences\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "231f19a8", "metadata": {}, "outputs": [], "source": [ "def normalize_inverse_ppl(ppl_scores, epsilon=1e-6):\n", " \"\"\"\n", " Safely normalizes inverse perplexity (1/PPL_i) to [0, 1].\n", " \n", " Handles edge cases where PPL scores are constant, contain inf/nan, or are invalid.\n", " \"\"\"\n", " ppl_scores = np.array(ppl_scores, dtype=np.float64)\n", "\n", " # Replace infinities or NaNs with large finite numbers for stability\n", " ppl_scores = np.nan_to_num(ppl_scores, nan=np.inf, posinf=np.inf, neginf=np.inf)\n", "\n", " # Compute inverse PPL (fluency measure)\n", " inv_ppl = 1.0 / (ppl_scores + epsilon)\n", "\n", " # Remove any remaining NaNs/Infs from inverse scores\n", " inv_ppl = np.nan_to_num(inv_ppl, nan=0.0, posinf=0.0, neginf=0.0)\n", "\n", " inv_min = np.min(inv_ppl)\n", " inv_max = np.max(inv_ppl)\n", "\n", " # Handle zero-range case: all scores are the same\n", " if np.isclose(inv_max, inv_min) or np.isnan(inv_max - inv_min):\n", " return np.zeros_like(inv_ppl)\n", "\n", " # Normal min–max scaling\n", " inv_ppl_norm = (inv_ppl - inv_min) / (inv_max - inv_min)\n", " inv_ppl_norm = np.clip(inv_ppl_norm, 0.0, 1.0)\n", "\n", " return inv_ppl_norm\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ad791177", "metadata": {}, "outputs": [], "source": [ "# Function for Step 3: Semantic Similarity (Sim)\n", "\n", "def calculate_semantic_similarity(s_i, t_i, model, tokenizer, device):\n", " \"\"\"\n", " Computes Cosine Similarity between source and target sentence embeddings \n", " and normalizes the result to the range [0, 1].\n", " \"\"\"\n", " try:\n", " def get_embedding(sentence, lang_code):\n", " tokenizer.src_lang = lang_code\n", " inputs = tokenizer(\n", " sentence,\n", " return_tensors=\"pt\",\n", " max_length=512,\n", " truncation=True,\n", " padding=True\n", " ).to(device)\n", " \n", " with torch.no_grad():\n", " encoder_output = model.model.encoder(**inputs).last_hidden_state\n", " \n", " mean_embedding = encoder_output[:, 1:-1, :].mean(dim=1).squeeze() \n", " \n", " return mean_embedding.cpu().detach().numpy().reshape(1, -1)\n", "\n", " emb_s = get_embedding(s_i, SRC_LANG_CODE) \n", " emb_t = get_embedding(t_i, TGT_LANG_CODE)\n", "\n", " sim_raw = cosine_similarity(emb_s, emb_t)[0][0]\n", " \n", " sim_normalized = (sim_raw + 1) / 2\n", " sim_normalized = max(0.0, min(1.0, sim_normalized))\n", " \n", " return sim_normalized\n", " \n", " except Exception as e:\n", " # print(f\"Error calculating Sim for: '{s_i}' and '{t_i}'. Error: {e}\")\n", " return 0.0" ] }, { "cell_type": "code", "execution_count": null, "id": "87eebd8b", "metadata": {}, "outputs": [], "source": [ "# Function for Step 4: Lexical Match (Lex) # header describing the block\n", "# blank line preserved for readability\n", "# Define a function that computes a lexical match score based on a bilingual dictionary\n", "def calculate_lexical_match(s_i, t_i, word_dictionary):\n", " # Docstring start: describe purpose and formula for lex score\n", " \"\"\"\n", " Computes a dictionary-based lexical match score prioritizing phrase matches.\n", " Score = (Count of source words covered by successfully translated phrases) / (Total words in source sentence)\n", " \"\"\" # docstring end\n", " # Helper: normalize text to ease phrase matching (lowercase, token boundaries)\n", " def normalize_text(text):\n", " # Simple tokenization: lowercase, remove non-word characters, and join back for easy phrase matching\n", " return \" \" + \" \".join(re.findall(r'\\b\\w+\\b', text.lower())) + \" \" # pad with spaces for boundary-safe matching\n", " # Normalize the source sentence for phrase lookups\n", " s_normalized = normalize_text(s_i)\n", " # Token set of the target sentence for quick membership tests\n", " t_tokens = set(re.findall(r'\\b\\w+\\b', t_i.lower()))\n", " # blank line preserved for readability\n", " # Sort dictionary keys by length (descending) to prioritize phrase matches over single words\n", " tagin_phrases = sorted(word_dictionary.keys(), key=len, reverse=True)\n", " # blank line preserved for readability\n", " # Extract source word tokens and compute total count\n", " source_words = re.findall(r'\\b\\w+\\b', s_i.lower())\n", " total_source_words = len(source_words)\n", " # Initialize covered word counter\n", " covered_word_count = 0\n", " # If the source sentence is empty, return 0.0 immediately\n", " if total_source_words == 0:\n", " return 0.0\n", " # Track indices of covered words if needed (not used further but kept for clarity)\n", " covered_indices = set()\n", " # Iterate over dictionary phrases (longest-first) to find matches in the source\n", " for phrase in tagin_phrases:\n", " # Skip empty dictionary entries\n", " if not phrase:\n", " continue\n", " # Normalize the phrase for safe matching\n", " norm_phrase = normalize_text(phrase)\n", " # If the normalized phrase exists in the normalized source text, proceed\n", " if norm_phrase in s_normalized:\n", " # Get expected translation from the dictionary (lowercased)\n", " expected_translation = word_dictionary[phrase].lower()\n", " # Tokenize the expected translation into words\n", " translation_words = re.findall(r'\\b\\w+\\b', expected_translation)\n", " # Check whether all translated words appear in the target sentence tokens\n", " is_translation_present = all(word in t_tokens for word in translation_words)\n", " # If the translation words are present in the target, count the phrase as covered\n", " if is_translation_present:\n", " # Search for possibly multiple occurrences of the phrase in the source\n", " start = 0\n", " while True:\n", " # Find next occurrence starting from 'start' index\n", " start_index = s_normalized.find(norm_phrase, start)\n", " # If no more occurrences, break the loop\n", " if start_index == -1:\n", " break\n", " # Count how many words are in the matched phrase\n", " phrase_word_count = len(re.findall(r'\\b\\w+\\b', phrase))\n", " # Add the phrase's word count to the covered total\n", " covered_word_count += phrase_word_count\n", " # Advance the search start position past the current match\n", " start = start_index + len(norm_phrase)\n", " # end while loop for occurrences\n", " # After checking all phrases, compute lex score as covered words / total source words (capped at 1.0)\n", " lex_score = min(1.0, covered_word_count / total_source_words)\n", " # Return lexical match score between 0 and 1\n", " return lex_score" ] }, { "cell_type": "code", "execution_count": null, "id": "46f310e4", "metadata": {}, "outputs": [], "source": [ "# Main Algorithm Implementation\n", "def knowledge_based_filtering(raw_corpus, word_dictionary, alpha, beta, gamma, percentile_threshold):\n", " # 1. Load Model and Tokenizer (Step 1 of the algorithm's loop)\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " print(f\"Loading mBART-tgj-base model to {device}...\")\n", " model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)\n", " tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)\n", " \n", " # Ensure source/target language codes are in the tokenizer vocabulary\n", " if SRC_LANG_CODE not in tokenizer.vocab or TGT_LANG_CODE not in tokenizer.vocab:\n", " print(f\"Warning: Language codes {SRC_LANG_CODE} or {TGT_LANG_CODE} not found in base mBART-tgj-base vocab.\")\n", " print(\"Using placeholder language codes. Results may not be accurate.\")\n", "\n", " results = []\n", "\n", " # 2. Iterate through the corpus (Lines 1-6)\n", " print(\"Processing corpus to calculate scores...\")\n", " for index, row in tqdm(raw_corpus.iterrows(), total=len(raw_corpus), desc=\"Calculating KS\"):\n", " s_i = row['English']\n", " t_i = row['Tagin']\n", "\n", " # Line 2: Compute PPL_i (Lower is better)\n", " pp= calculate_perplexity(s_i, model, tokenizer, device)\n", " PPL_i= normalize_inverse_ppl(pp, epsilon=1e-6)\n", " # PPL_i = normalize_inverse_ppl(row[\"Perplexity\"])\n", " \n", " # Line 3: Compute Sim_i (Higher is better)\n", " Sim_i = calculate_semantic_similarity(s_i, t_i, model, tokenizer, device)\n", " \n", " # Line 4: Check Lex_i (Higher is better)\n", " Lex_i = calculate_lexical_match(s_i, t_i, word_dictionary)\n", " \n", " # Line 5: Derive Knowledge Score (KS_i)\n", " # Note: We use 1/PPL_i because PPL_i is an inverse quality metric (lower PPL is higher quality)\n", " # while Sim and Lex are direct quality metrics (higher is better).\n", " # We add a small epsilon to avoid division by zero, though a PPL of 0 is practically impossible.\n", " # PPL_i_inv = 1.0 / (PPL_i + 1e-6)\n", " # -----IMPORTANT------\n", " \n", " KS_i = alpha * PPL_i + beta * Sim_i + gamma * Lex_i\n", " \n", " results.append({\n", " 'src_lang': s_i,\n", " 'tgt_lang': t_i,\n", " 'PPL_i': PPL_i,\n", " 'Sim_i': Sim_i,\n", " 'Lex_i': Lex_i,\n", " 'PPL_i': PPL_i,\n", " 'KS_i': KS_i\n", " })\n", "\n", " # Convert results to DataFrame for filtering\n", " scored_corpus = pd.DataFrame(results)\n", "\n", " # 3. Determine Threshold and Filter (Lines 7-9)\n", " # Line 7: Find the 80th percentile of Knowledge Scores\n", " tau_K = np.percentile(scored_corpus['KS_i'], percentile_threshold)\n", " print(f\"\\n50th Percentile Knowledge Score (τ_K): {tau_K:.4f}\")\n", " \n", " # Line 8: Filter the corpus\n", " D_filtered = scored_corpus[scored_corpus['KS_i'] >= tau_K].copy()\n", " \n", " # Final cleanup of columns and return\n", " D_filtered = D_filtered[['src_lang', 'tgt_lang', 'KS_i']]\n", " print(f\"Raw corpus size: {len(raw_corpus)}\")\n", " print(f\"Filtered corpus size (KS_i >= τ_K): {len(D_filtered)}\")\n", " \n", " return D_filtered" ] }, { "cell_type": "code", "execution_count": null, "id": "2b2ce69d", "metadata": {}, "outputs": [], "source": [ "# --- Execution --- # script entry and high-level execution steps\n", "# blank line preserved for readability\n", "# Guard to ensure code only runs when executed as a script, not on import\n", "if __name__ == '__main__':\n", " # 1. Load data # load and preprocess corpus and dictionary files\n", " raw_corpus, word_dictionary = load_data(CORPUS_FILE, DICT_FILE)\n", " # blank line preserved for readability\n", " # 2. Run the filtering algorithm # compute KS_i and filter by percentile\n", " filtered_corpus = knowledge_based_filtering(\n", " raw_corpus, # pass the preprocessed corpus DataFrame\n", " word_dictionary, # pass the dictionary mapping\n", " ALPHA, BETA, GAMMA, # weighting hyperparameters for KS_i\n", " PERCENTILE_THRESHOLD # percentile cutoff for filtering\n", " )\n", " # blank line preserved for readability\n", " # Save the filtered corpus to CSV for downstream use\n", " filtered_corpus.to_csv(\"tgj_corpus_filtered_70th.csv\", index=False)\n", " # blank line preserved for readability\n", " # Notify user of completion and where results were saved\n", " print(\"\\nFiltering complete. Results saved to tgj_corpus_filtered_70th.csv\")\n", " # Show a short preview of the filtered corpus\n", " print(\"\\nFiltered Corpus Head:\")\n", " print(filtered_corpus) # print DataFrame to stdout" ] } ], "metadata": { "kernelspec": { "display_name": "ptorch", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }