File size: 15,673 Bytes
ad0be11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "54834b8c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd  # DataFrame tooling for the dedup pass\n",
"\n",
"# Load the corpus that still contains duplicate sentence pairs.\n",
"df = pd.read_csv(\"filtered_corpus_here.csv\")\n",
"\n",
"# Normalize whitespace so pairs differing only in spacing compare equal:\n",
"# trim the ends, then collapse internal whitespace runs to single spaces.\n",
"df['src_norm'] = df['src_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True)\n",
"df['tgt_norm'] = df['tgt_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True)\n",
"\n",
"# A pair is a duplicate only when BOTH normalized sides match; keep the first hit.\n",
"df_unique = df.drop_duplicates(subset=['src_norm', 'tgt_norm'], keep='first')\n",
"\n",
"# The helper columns were only needed for matching - drop them before saving.\n",
"df_unique = df_unique.drop(columns=['src_norm', 'tgt_norm'])\n",
"\n",
"# Persist the deduplicated corpus without the pandas row index.\n",
"df_unique.to_csv(\"filtered_corpus_here_removedDuplicates.csv\", index=False)\n",
"\n",
"print(\"Done. Original rows:\", len(df))\n",
"print(\"New rows:\", len(df_unique))\n",
"print(\"Removed:\", len(df) - len(df_unique))\n",
"# end of cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73f3d91",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd # pandas for DataFrame operations\n",
"import numpy as np # numpy for numeric utilities (unused but commonly imported)\n",
"import string # string constants, used to build punctuation translator\n",
"import os # os module for file/path operations (imported for potential use)\n",
"# blank line for readability\n",
"# --- Configuration --- # configuration section start\n",
"# NOTE: This script assumes 'sample_corpus.csv' exists to generate the filtered file. # informational note\n",
"FILEPATH_RAW = 'filtered_corpus_here_removedDuplicates.csv' # path to the deduplicated raw corpus\n",
"FILEPATH_FILTERED = 'filtered_corpus_top_20.csv' # path to write/read the top filtered corpus\n",
"SOURCE_LANG_COL = 'src_lang' # column name for source language text\n",
"TARGET_LANG_COL = 'tgt_lang' # column name for target language text\n",
"SCORE_COL = 'KS_i' # column name that stores the Knowledge Score\n",
"# Filtering constants moved from calculate_threshold.py # note about origin of constant\n",
"TARGET_PERCENTILE = 80 # percentile threshold to filter top N% by score\n",
"OUTPUT_FILENAME = FILEPATH_FILTERED # Ensure output filename is the same # output target file variable\n",
"# --- End Configuration --- # configuration section end\n",
"# blank line for readability\n",
"def calculate_knowledge_threshold(filepath, percentile):\n",
"    \"\"\"Load a scored corpus and keep only rows at or above a score percentile.\n",
"\n",
"    Reads `filepath` into a DataFrame, computes the `percentile`-th percentile\n",
"    of the SCORE_COL column (tau_K), and returns (filtered_df, tau_K).\n",
"    On any failure, an empty DataFrame and None are returned instead.\n",
"    \"\"\"\n",
"    try:\n",
"        df = pd.read_csv(filepath)  # load the scored corpus\n",
"\n",
"        # Bail out early if the score column is absent entirely.\n",
"        if SCORE_COL not in df.columns:\n",
"            print(f\"Error: The CSV file must contain a column named '{SCORE_COL}'. Found columns: {list(df.columns)}\")\n",
"            return pd.DataFrame(), None\n",
"\n",
"        df_initial_size = len(df)  # remembered so dropped rows can be reported\n",
"        # Force the score column to numeric; unparseable entries become NaN ...\n",
"        df[SCORE_COL] = pd.to_numeric(df[SCORE_COL], errors='coerce')\n",
"        # ... and NaN-score rows are removed before the quantile is taken.\n",
"        df = df.dropna(subset=[SCORE_COL])\n",
"\n",
"        if len(df) < df_initial_size:\n",
"            print(f\"Warning: Dropped {df_initial_size - len(df)} rows with non-numeric scores.\")\n",
"\n",
"        # tau_K: the score value sitting at the requested percentile.\n",
"        tau_K = df[SCORE_COL].quantile(percentile / 100, interpolation='linear')\n",
"\n",
"        # D_filtered: every row whose score reaches the threshold, as a copy.\n",
"        filtered = df[df[SCORE_COL] >= tau_K].copy()\n",
"        return filtered, tau_K\n",
"    except FileNotFoundError:\n",
"        print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\")\n",
"        return pd.DataFrame(), None\n",
"    except Exception as e:\n",
"        print(f\"An unexpected error occurred during filtering: {e}\")\n",
"        return pd.DataFrame(), None\n",
"# blank line for readability\n",
"def tokenize_and_clean(text_series): # function to tokenize text series and clean tokens\n",
" \"\"\" # docstring start\n",
" Tokenizes text by splitting on whitespace, then cleans tokens by removing # description line 1\n",
" punctuation and converting to lowercase for accurate token counting and vocabulary size. # description line 2\n",
" \"\"\" # docstring end\n",
" all_tokens = [] # accumulator for all tokens across sentences\n",
" # blank line for readability\n",
" # Simple preprocessing: remove punctuation and lowercase # describe translator creation\n",
" translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) # map punctuation to spaces\n",
" # blank line for readability\n",
" for text in text_series.astype(str): # iterate rows as strings\n",
" clean_text = text.translate(translator).lower() # remove punctuation and lowercase the text\n",
" tokens = clean_text.split() # split on whitespace into tokens\n",
" all_tokens.extend(tokens) # add tokens to accumulator\n",
" # loop continues for next sentence\n",
" return all_tokens # return the flattened token list\n",
"# blank line for readability\n",
"def calculate_corpus_metrics(filepath):\n",
"    \"\"\"Calculate all required corpus statistics for a given file.\n",
"\n",
"    Returns a tuple (sentence_pairs, metrics) where `metrics` maps the tags\n",
"    'English'/'Target' to dicts with 'tokens', 'avg_len' and 'vocab_size'.\n",
"    Returns (None, None) when the file is missing or empty: the caller unpacks\n",
"    the result into two names, and the previous bare `return None` made that\n",
"    unpack raise 'cannot unpack non-iterable NoneType'.\n",
"    Raises KeyError if an expected text column is absent from the file.\n",
"    \"\"\"\n",
"    try:\n",
"        df = pd.read_csv(filepath)  # load the (already filtered) corpus\n",
"    except FileNotFoundError:\n",
"        print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\")\n",
"        return None, None  # two values so `pairs, metrics = ...` stays unpackable\n",
"    except pd.errors.EmptyDataError:\n",
"        print(f\"Error: The file '{filepath}' is empty.\")\n",
"        return None, None  # same tuple shape on every exit path\n",
"\n",
"    # 1. Metric: sentence pairs - one row per aligned pair.\n",
"    sentence_pairs = len(df)\n",
"\n",
"    # Fail loudly if either text column is missing before computing metrics.\n",
"    for col in [SOURCE_LANG_COL, TARGET_LANG_COL]:\n",
"        if col not in df.columns:\n",
"            raise KeyError(f\"Column '{col}' not found in the corpus file.\")\n",
"\n",
"    # 2. Metrics per side: token count, average sentence length, vocabulary size.\n",
"    metrics = {}\n",
"    for col, tag in [(SOURCE_LANG_COL, 'English'), (TARGET_LANG_COL, 'Target')]:\n",
"        tokens = tokenize_and_clean(df[col])  # flat cleaned token list for this side\n",
"        total_tokens = len(tokens)\n",
"        vocab_size = len(set(tokens))  # unique tokens after cleaning\n",
"        # Average tokens per pair; guard division by zero on an empty corpus.\n",
"        avg_sentence_length = total_tokens / sentence_pairs if sentence_pairs > 0 else 0\n",
"        metrics[tag] = {\n",
"            'tokens': total_tokens,  # total token count across all sentences\n",
"            'avg_len': avg_sentence_length,  # average sentence length in tokens\n",
"            'vocab_size': vocab_size  # vocabulary size after preprocessing\n",
"        }\n",
"    return sentence_pairs, metrics\n",
"# blank line for readability\n",
"def format_and_print_results(sentence_pairs, metrics):\n",
"    \"\"\"Formats the calculated metrics into the requested table structure.\n",
"\n",
"    Fix: the Notes column was padded to 10 chars while its longest entry\n",
"    ('After Preprocessing') is 19, so the right-hand '|' border drifted on\n",
"    those rows, and the 80-char rules were narrower than the actual rows.\n",
"    Column and rule widths are now derived from the content.\n",
"    \"\"\"\n",
"    notes_w = 19  # width of the widest note string ('After Preprocessing')\n",
"    table_w = 2 + 30 + 3 + 20 + 3 + 20 + 3 + notes_w + 2  # full printed row width\n",
"\n",
"    src_data = metrics['English']  # source/English-side metrics\n",
"    tgt_data = metrics['Target']  # target-side metrics\n",
"\n",
"    print(\"\\n\" + \"=\" * table_w)  # top border sized to the table\n",
"    print(f\" FILTERED CORPUS METRICS ({FILEPATH_FILTERED})\")\n",
"    print(\"=\" * table_w)\n",
"\n",
"    # Header row.\n",
"    print(f\"| {'Metric':<30} | {'Source (English)':>20} | {'Target (Target)':>20} | {'Notes':<{notes_w}} |\")\n",
"    print(\"-\" * table_w)\n",
"\n",
"    # Row: sentence pairs (identical on both sides by construction).\n",
"    print(f\"| {'Sentence Pairs':<30} | {sentence_pairs:>20,} | {sentence_pairs:>20,} | {'--':<{notes_w}} |\")\n",
"\n",
"    # Row: token totals, thousands-separated.\n",
"    src_tokens_display = f\"{src_data['tokens']:,}\"\n",
"    tgt_tokens_display = f\"{tgt_data['tokens']:,}\"\n",
"    print(f\"| {'Tokens':<30} | {src_tokens_display:>20} | {tgt_tokens_display:>20} | {'Actual Count':<{notes_w}} |\")\n",
"\n",
"    # Row: average tokens per sentence pair, two decimals.\n",
"    print(f\"| {'Avg. Sentence Length':<30} | {src_data['avg_len']:>20.2f} | {tgt_data['avg_len']:>20.2f} | {'Tokens/Pair':<{notes_w}} |\")\n",
"\n",
"    # Row: unique-token counts after punctuation stripping and lowercasing.\n",
"    print(f\"| {'Vocabulary Size':<30} | {src_data['vocab_size']:>20,} | {tgt_data['vocab_size']:>20,} | {'After Preprocessing':<{notes_w}} |\")\n",
"\n",
"    # Row: OOV rate needs a held-out test set, which this script does not have.\n",
"    print(f\"| {'OOV Rate':<30} | {'--':>20} | {'--':>20} | {'Requires Test Set':<{notes_w}} |\")\n",
"\n",
"    print(\"-\" * table_w)  # bottom rule, same width as the rows\n",
"# blank line for readability\n",
"# blank line for readability\n",
"if __name__ == \"__main__\":  # script entrypoint guard\n",
"    print(\"NOTE: Running filtering logic to ensure 'filtered_corpus_top_20.csv' is up-to-date for metrics.\")\n",
"\n",
"    # 1. Filter the raw corpus down to the top TARGET_PERCENTILE slice by score.\n",
"    filtered_df, threshold = calculate_knowledge_threshold(FILEPATH_RAW, TARGET_PERCENTILE)\n",
"\n",
"    if filtered_df.empty:  # nothing survived filtering (or loading failed)\n",
"        print(\"Could not generate filtered corpus. Cannot proceed with metrics calculation.\")\n",
"    else:\n",
"        # 2. Persist the filtered corpus so the metrics step reads the same data.\n",
"        filtered_df.to_csv(OUTPUT_FILENAME, index=False)\n",
"        print(f\"Filtered corpus saved to '{OUTPUT_FILENAME}'.\")\n",
"\n",
"        # 3. Compute metrics on the saved file. Guard the failure sentinel:\n",
"        # calculate_corpus_metrics signals failure with None (or a (None, None)\n",
"        # pair); unpacking a bare None directly would raise a TypeError here.\n",
"        result = calculate_corpus_metrics(FILEPATH_FILTERED)\n",
"        if result is not None and result[1]:\n",
"            sentence_pairs, metrics = result\n",
"            format_and_print_results(sentence_pairs, metrics)  # print the formatted table\n",
"        else:\n",
"            print(\"Metrics calculation failed.\")\n",
"# end of cell"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ptorch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|