{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "54834b8c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd # import pandas and alias as pd for DataFrame operations\n", "# blank line kept for readability\n", "# Load your CSV # comment indicating next line loads the CSV into a DataFrame\n", "df = pd.read_csv(\"filtered_corpus_here.csv\") # read the CSV file into variable df\n", "# blank line kept for readability\n", "# Normalize whitespace so \"hello world\" and \"hello world\" match # explain normalization intent\n", "df['src_norm'] = df['src_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True) # strip edges and collapse multiple spaces in source column\n", "df['tgt_norm'] = df['tgt_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True) # same normalization for target column\n", "# blank line kept for readability\n", "# Drop duplicates based on combined src+tgt # remove identical source-target pairs after normalization\n", "df_unique = df.drop_duplicates(subset=['src_norm', 'tgt_norm'], keep='first') # keep first occurrence of duplicate pairs\n", "# blank line kept for readability\n", "# Remove helper columns # drop intermediate normalization columns before saving\n", "df_unique = df_unique.drop(columns=['src_norm', 'tgt_norm']) # remove the temporary normalized columns\n", "# blank line kept for readability\n", "# Save result # write the deduplicated DataFrame to a new CSV file\n", "df_unique.to_csv(\"filtered_corpus_here_removedDuplicates.csv\", index=False) # save without row index\n", "# blank line kept for readability\n", "print(\"Done. Original rows:\", len(df)) # print the original number of rows\n", "print(\"New rows:\", len(df_unique)) # print the number of rows after deduplication\n", "print(\"Removed:\", len(df) - len(df_unique)) # print how many rows were removed\n", "# end of cell" ] }, { "cell_type": "code", "execution_count": null, "id": "e73f3d91", "metadata": {}, "outputs": [], "source": [ "import pandas as pd # pandas for DataFrame operations\n", "import numpy as np # numpy for numeric utilities (unused but commonly imported)\n", "import string # string constants, used to build punctuation translator\n", "import os # os module for file/path operations (imported for potential use)\n", "# blank line for readability\n", "# --- Configuration --- # configuration section start\n", "# NOTE: This script assumes 'sample_corpus.csv' exists to generate the filtered file. # informational note\n", "FILEPATH_RAW = 'filtered_corpus_here_removedDuplicates.csv' # path to the deduplicated raw corpus\n", "FILEPATH_FILTERED = 'filtered_corpus_top_20.csv' # path to write/read the top filtered corpus\n", "SOURCE_LANG_COL = 'src_lang' # column name for source language text\n", "TARGET_LANG_COL = 'tgt_lang' # column name for target language text\n", "SCORE_COL = 'KS_i' # column name that stores the Knowledge Score\n", "# Filtering constants moved from calculate_threshold.py # note about origin of constant\n", "TARGET_PERCENTILE = 80 # percentile threshold to filter top N% by score\n", "OUTPUT_FILENAME = FILEPATH_FILTERED # Ensure output filename is the same # output target file variable\n", "# --- End Configuration --- # configuration section end\n", "# blank line for readability\n", "def calculate_knowledge_threshold(filepath, percentile): # function to compute tau_K and filtered DF\n", " \"\"\" # docstring start\n", " Reads a CSV file, calculates the specified percentile of the Knowledge Score # description line 1\n", " column, and returns the filtered corpus and the threshold (tau_K). # description line 2\n", " \"\"\" # docstring end\n", " try: # attempt to load and process the file\n", " # 1. Load the data # step 1 comment\n", " df = pd.read_csv(filepath) # load CSV into DataFrame df\n", " # blank line for readability\n", " # 1.5. Robust column check and numeric conversion # validate columns and convert types\n", " if SCORE_COL not in df.columns: # check that score column exists\n", " print(f\"Error: The CSV file must contain a column named '{SCORE_COL}'. Found columns: {list(df.columns)}\") # informative error\n", " return pd.DataFrame(), None # return empty DF and None threshold on error\n", " # blank line for readability\n", " df_initial_size = len(df) # store initial row count for warnings\n", " # Convert the score column to numeric, coercing errors # ensure numeric dtype for quantile\n", " df[SCORE_COL] = pd.to_numeric(df[SCORE_COL], errors='coerce') # coerce invalids to NaN\n", " df.dropna(subset=[SCORE_COL], inplace=True) # drop rows where score could not be parsed\n", " # blank line for readability\n", " if len(df) < df_initial_size: # warn if rows were dropped\n", " print(f\"Warning: Dropped {df_initial_size - len(df)} rows with non-numeric scores.\") # warn about dropped rows\n", " # blank line for readability\n", " # 2. Calculate the threshold (tau_K) # compute percentile threshold\n", " tau_K = df[SCORE_COL].quantile(percentile / 100, interpolation='linear') # compute numeric threshold\n", " # blank line for readability\n", " # 3. Apply the threshold to construct the filtered corpus (D_filtered) # filter rows >= tau_K\n", " D_filtered = df[df[SCORE_COL] >= tau_K].copy() # select high-score rows and copy to new DF\n", " # blank line for readability\n", " return D_filtered, tau_K # return filtered DF and threshold\n", " except FileNotFoundError: # handle missing file error\n", " print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\") # print helpful message\n", " return pd.DataFrame(), None # return empty DF and None threshold when file missing\n", " except Exception as e: # catch-all for other errors\n", " print(f\"An unexpected error occurred during filtering: {e}\") # print exception details\n", " return pd.DataFrame(), None # return safe defaults on error\n", "# blank line for readability\n", "def tokenize_and_clean(text_series): # function to tokenize text series and clean tokens\n", " \"\"\" # docstring start\n", " Tokenizes text by splitting on whitespace, then cleans tokens by removing # description line 1\n", " punctuation and converting to lowercase for accurate token counting and vocabulary size. # description line 2\n", " \"\"\" # docstring end\n", " all_tokens = [] # accumulator for all tokens across sentences\n", " # blank line for readability\n", " # Simple preprocessing: remove punctuation and lowercase # describe translator creation\n", " translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) # map punctuation to spaces\n", " # blank line for readability\n", " for text in text_series.astype(str): # iterate rows as strings\n", " clean_text = text.translate(translator).lower() # remove punctuation and lowercase the text\n", " tokens = clean_text.split() # split on whitespace into tokens\n", " all_tokens.extend(tokens) # add tokens to accumulator\n", " # loop continues for next sentence\n", " return all_tokens # return the flattened token list\n", "# blank line for readability\n", "def calculate_corpus_metrics(filepath): # function to compute corpus-level metrics\n", " \"\"\"Calculates all required corpus statistics for a given file.\"\"\" # single-line docstring\n", " try: # try to read the file into a DataFrame\n", " df = pd.read_csv(filepath) # load corpus file\n", " except FileNotFoundError: # handle missing file\n", " print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\") # user-friendly message\n", " return None # return None to indicate failure\n", " except pd.errors.EmptyDataError: # handle empty file error\n", " print(f\"Error: The file '{filepath}' is empty.\") # inform user file has no data\n", " return None # return None to indicate failure\n", " # blank line for readability\n", " # 1. Metric: Sentence Pairs # compute total number of sentence pairs\n", " sentence_pairs = len(df) # number of rows equals sentence pairs\n", " # blank line for readability\n", " # 2. Metric: Tokens, Avg. Sentence Length, Vocabulary Size # prepare metrics container\n", " metrics = {} # dict to hold source/target metrics\n", " # blank line for readability\n", " # Ensure column existence before proceeding # validate presence of expected columns\n", " for col in [SOURCE_LANG_COL, TARGET_LANG_COL]: # iterate expected column names\n", " if col not in df.columns: # raise if missing\n", " raise KeyError(f\"Column '{col}' not found in the corpus file.\") # explicit error to surface missing columns\n", " # blank line for readability\n", " for col, tag in [(SOURCE_LANG_COL, 'English'), (TARGET_LANG_COL, 'Target')]: # compute metrics for both sides\n", " # Tokenization and Cleaning # comment for next steps\n", " tokens = tokenize_and_clean(df[col]) # get token list for this column\n", " total_tokens = len(tokens) # total token count across all sentences\n", " # blank line for readability\n", " # Vocabulary Size # compute unique token count\n", " vocab_size = len(set(tokens)) # size of unique token set\n", " # blank line for readability\n", " # Avg. Sentence Length (Tokens / Sentence Pairs) # compute average tokens per sentence\n", " avg_sentence_length = total_tokens / sentence_pairs if sentence_pairs > 0 else 0 # guard division by zero\n", " # blank line for readability\n", " metrics[tag] = { # store computed metrics under tag key\n", " 'tokens': total_tokens, # total tokens count\n", " 'avg_len': avg_sentence_length, # average sentence length in tokens\n", " 'vocab_size': vocab_size # vocabulary size after preprocessing\n", " }\n", " return sentence_pairs, metrics # return computed metrics\n", "# blank line for readability\n", "def format_and_print_results(sentence_pairs, metrics): # pretty-print the metrics in table form\n", " \"\"\"Formats the calculated metrics into the requested table structure.\"\"\" # docstring\n", " # blank line for readability\n", " src_data = metrics['English'] # metrics for source/English side\n", " tgt_data = metrics['Target'] # metrics for target side\n", " # blank line for readability\n", " print(\"\\n\" + \"=\"*80) # print top border\n", " print(f\" FILTERED CORPUS METRICS ({FILEPATH_FILTERED})\") # title with filename\n", " print(\"=\"*80) # print border again\n", " # blank line for readability\n", " # Table Header # print header row labels\n", " print(f\"| {'Metric':<30} | {'Source (English)':>20} | {'Target (Target)':>20} | {'Notes':<10} |\") # header formatting\n", " print(\"-\" * 80) # separator line\n", " # blank line for readability\n", " # Row: Sentence Pairs # print sentence pair counts\n", " print(f\"| {'Sentence Pairs':<30} | {sentence_pairs:>20,} | {sentence_pairs:>20,} | {'--':<10} |\") # counts for both columns\n", " # blank line for readability\n", " # Row: Tokens (Formatted for M/K display based on size) # prepare token display strings\n", " src_tokens_display = f\"{src_data['tokens']:,}\" # formatted source token count\n", " tgt_tokens_display = f\"{tgt_data['tokens']:,}\" # formatted target token count\n", " # blank line for readability\n", " print(f\"| {'Tokens':<30} | {src_tokens_display:>20} | {tgt_tokens_display:>20} | {'Actual Count':<10} |\") # print tokens row\n", " # blank line for readability\n", " # Row: Avg. Sentence Length # print average sentence lengths\n", " print(f\"| {'Avg. Sentence Length':<30} | {src_data['avg_len']:>20.2f} | {tgt_data['avg_len']:>20.2f} | {'Tokens/Pair':<10} |\") # two-decimal precision\n", " # blank line for readability\n", " # Row: Vocabulary Size # print vocabulary sizes\n", " print(f\"| {'Vocabulary Size':<30} | {src_data['vocab_size']:>20,} | {tgt_data['vocab_size']:>20,} | {'After Preprocessing':<10} |\") # vocab counts\n", " # blank line for readability\n", " # Row: OOV Rate (Placeholder since test set is needed) # OOV requires a test set\n", " print(f\"| {'OOV Rate':<30} | {'--':>20} | {'--':>20} | {'Requires Test Set':<10} |\") # placeholder output\n", " # blank line for readability\n", " print(\"-\" * 80) # bottom separator\n", "# blank line for readability\n", "# blank line for readability\n", "if __name__ == \"__main__\": # script entrypoint guard\n", " # blank line for readability\n", " print(\"NOTE: Running filtering logic to ensure 'filtered_corpus_top_20.csv' is up-to-date for metrics.\") # informational message\n", " # blank line for readability\n", " # 1. Run the filtering logic (now self-contained) # compute filtered DF and threshold\n", " filtered_df, threshold = calculate_knowledge_threshold(FILEPATH_RAW, TARGET_PERCENTILE) # call to compute top percentile\n", " # blank line for readability\n", " if filtered_df.empty: # check if filtering produced results\n", " print(\"Could not generate filtered corpus. Cannot proceed with metrics calculation.\") # failure message\n", " else: # if we have filtered data, continue\n", " # 2. Save the filtered corpus (important for the metrics script to read it) # save step\n", " filtered_df.to_csv(OUTPUT_FILENAME, index=False) # write filtered DF to output file\n", " print(f\"Filtered corpus saved to '{OUTPUT_FILENAME}'.\") # confirmation message\n", " # blank line for readability\n", " # 3. Now run the actual metrics calculation on the saved filtered file # compute metrics\n", " sentence_pairs, metrics = calculate_corpus_metrics(FILEPATH_FILTERED) # call metrics function\n", " # blank line for readability\n", " if metrics: # if metrics computed successfully\n", " format_and_print_results(sentence_pairs, metrics) # print the formatted table\n", " else: # metrics call failed\n", " print(\"Metrics calculation failed.\") # failure message\n", "# end of cell" ] } ], "metadata": { "kernelspec": { "display_name": "ptorch", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }