File size: 15,673 Bytes
ad0be11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "54834b8c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd  # DataFrame tooling for the dedup pass\n",
"\n",
"# Load the corpus that still contains duplicate sentence pairs.\n",
"df = pd.read_csv(\"filtered_corpus_here.csv\")\n",
"\n",
"# Normalize whitespace so pairs differing only in spacing compare equal:\n",
"# trim the ends, then collapse internal whitespace runs to single spaces.\n",
"df['src_norm'] = df['src_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True)\n",
"df['tgt_norm'] = df['tgt_lang'].str.strip().str.replace(r'\\s+', ' ', regex=True)\n",
"\n",
"# A pair is a duplicate only when BOTH normalized sides match; keep the first hit.\n",
"df_unique = df.drop_duplicates(subset=['src_norm', 'tgt_norm'], keep='first')\n",
"\n",
"# The helper columns were only needed for matching - drop them before saving.\n",
"df_unique = df_unique.drop(columns=['src_norm', 'tgt_norm'])\n",
"\n",
"# Persist the deduplicated corpus without the pandas row index.\n",
"df_unique.to_csv(\"filtered_corpus_here_removedDuplicates.csv\", index=False)\n",
"\n",
"print(\"Done. Original rows:\", len(df))\n",
"print(\"New rows:\", len(df_unique))\n",
"print(\"Removed:\", len(df) - len(df_unique))\n",
"# end of cell"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73f3d91",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd # pandas for DataFrame operations\n",
"import numpy as np # numpy for numeric utilities (unused but commonly imported)\n",
"import string # string constants, used to build punctuation translator\n",
"import os # os module for file/path operations (imported for potential use)\n",
"# blank line for readability\n",
"# --- Configuration --- # configuration section start\n",
"# NOTE: This script assumes 'sample_corpus.csv' exists to generate the filtered file. # informational note\n",
"FILEPATH_RAW = 'filtered_corpus_here_removedDuplicates.csv' # path to the deduplicated raw corpus\n",
"FILEPATH_FILTERED = 'filtered_corpus_top_20.csv' # path to write/read the top filtered corpus\n",
"SOURCE_LANG_COL = 'src_lang' # column name for source language text\n",
"TARGET_LANG_COL = 'tgt_lang' # column name for target language text\n",
"SCORE_COL = 'KS_i' # column name that stores the Knowledge Score\n",
"# Filtering constants moved from calculate_threshold.py # note about origin of constant\n",
"TARGET_PERCENTILE = 80 # percentile threshold to filter top N% by score\n",
"OUTPUT_FILENAME = FILEPATH_FILTERED # Ensure output filename is the same # output target file variable\n",
"# --- End Configuration --- # configuration section end\n",
"# blank line for readability\n",
"def calculate_knowledge_threshold(filepath, percentile):\n",
"    \"\"\"Load a scored corpus and keep only rows at or above a score percentile.\n",
"\n",
"    Reads `filepath` into a DataFrame, computes the `percentile`-th percentile\n",
"    of the SCORE_COL column (tau_K), and returns (filtered_df, tau_K).\n",
"    On any failure, an empty DataFrame and None are returned instead.\n",
"    \"\"\"\n",
"    try:\n",
"        df = pd.read_csv(filepath)  # load the scored corpus\n",
"\n",
"        # Bail out early if the score column is absent entirely.\n",
"        if SCORE_COL not in df.columns:\n",
"            print(f\"Error: The CSV file must contain a column named '{SCORE_COL}'. Found columns: {list(df.columns)}\")\n",
"            return pd.DataFrame(), None\n",
"\n",
"        df_initial_size = len(df)  # remembered so dropped rows can be reported\n",
"        # Force the score column to numeric; unparseable entries become NaN ...\n",
"        df[SCORE_COL] = pd.to_numeric(df[SCORE_COL], errors='coerce')\n",
"        # ... and NaN-score rows are removed before the quantile is taken.\n",
"        df = df.dropna(subset=[SCORE_COL])\n",
"\n",
"        if len(df) < df_initial_size:\n",
"            print(f\"Warning: Dropped {df_initial_size - len(df)} rows with non-numeric scores.\")\n",
"\n",
"        # tau_K: the score value sitting at the requested percentile.\n",
"        tau_K = df[SCORE_COL].quantile(percentile / 100, interpolation='linear')\n",
"\n",
"        # D_filtered: every row whose score reaches the threshold, as a copy.\n",
"        filtered = df[df[SCORE_COL] >= tau_K].copy()\n",
"        return filtered, tau_K\n",
"    except FileNotFoundError:\n",
"        print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\")\n",
"        return pd.DataFrame(), None\n",
"    except Exception as e:\n",
"        print(f\"An unexpected error occurred during filtering: {e}\")\n",
"        return pd.DataFrame(), None\n",
"# blank line for readability\n",
"def tokenize_and_clean(text_series): # function to tokenize text series and clean tokens\n",
" \"\"\" # docstring start\n",
" Tokenizes text by splitting on whitespace, then cleans tokens by removing # description line 1\n",
" punctuation and converting to lowercase for accurate token counting and vocabulary size. # description line 2\n",
" \"\"\" # docstring end\n",
" all_tokens = [] # accumulator for all tokens across sentences\n",
" # blank line for readability\n",
" # Simple preprocessing: remove punctuation and lowercase # describe translator creation\n",
" translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) # map punctuation to spaces\n",
" # blank line for readability\n",
" for text in text_series.astype(str): # iterate rows as strings\n",
" clean_text = text.translate(translator).lower() # remove punctuation and lowercase the text\n",
" tokens = clean_text.split() # split on whitespace into tokens\n",
" all_tokens.extend(tokens) # add tokens to accumulator\n",
" # loop continues for next sentence\n",
" return all_tokens # return the flattened token list\n",
"# blank line for readability\n",
"def calculate_corpus_metrics(filepath):\n",
"    \"\"\"Calculate all required corpus statistics for a given file.\n",
"\n",
"    Returns a tuple (sentence_pairs, metrics) where `metrics` maps the tags\n",
"    'English'/'Target' to dicts with 'tokens', 'avg_len' and 'vocab_size'.\n",
"    Returns (None, None) when the file is missing or empty: the caller unpacks\n",
"    the result into two names, and the previous bare `return None` made that\n",
"    unpack raise 'cannot unpack non-iterable NoneType'.\n",
"    Raises KeyError if an expected text column is absent from the file.\n",
"    \"\"\"\n",
"    try:\n",
"        df = pd.read_csv(filepath)  # load the (already filtered) corpus\n",
"    except FileNotFoundError:\n",
"        print(f\"Error: The file '{filepath}' was not found. Please ensure it exists.\")\n",
"        return None, None  # two values so `pairs, metrics = ...` stays unpackable\n",
"    except pd.errors.EmptyDataError:\n",
"        print(f\"Error: The file '{filepath}' is empty.\")\n",
"        return None, None  # same tuple shape on every exit path\n",
"\n",
"    # 1. Metric: sentence pairs - one row per aligned pair.\n",
"    sentence_pairs = len(df)\n",
"\n",
"    # Fail loudly if either text column is missing before computing metrics.\n",
"    for col in [SOURCE_LANG_COL, TARGET_LANG_COL]:\n",
"        if col not in df.columns:\n",
"            raise KeyError(f\"Column '{col}' not found in the corpus file.\")\n",
"\n",
"    # 2. Metrics per side: token count, average sentence length, vocabulary size.\n",
"    metrics = {}\n",
"    for col, tag in [(SOURCE_LANG_COL, 'English'), (TARGET_LANG_COL, 'Target')]:\n",
"        tokens = tokenize_and_clean(df[col])  # flat cleaned token list for this side\n",
"        total_tokens = len(tokens)\n",
"        vocab_size = len(set(tokens))  # unique tokens after cleaning\n",
"        # Average tokens per pair; guard division by zero on an empty corpus.\n",
"        avg_sentence_length = total_tokens / sentence_pairs if sentence_pairs > 0 else 0\n",
"        metrics[tag] = {\n",
"            'tokens': total_tokens,  # total token count across all sentences\n",
"            'avg_len': avg_sentence_length,  # average sentence length in tokens\n",
"            'vocab_size': vocab_size  # vocabulary size after preprocessing\n",
"        }\n",
"    return sentence_pairs, metrics\n",
"# blank line for readability\n",
"def format_and_print_results(sentence_pairs, metrics):\n",
"    \"\"\"Formats the calculated metrics into the requested table structure.\n",
"\n",
"    Fix: the Notes column was padded to 10 chars while its longest entry\n",
"    ('After Preprocessing') is 19, so the right-hand '|' border drifted on\n",
"    those rows, and the 80-char rules were narrower than the actual rows.\n",
"    Column and rule widths are now derived from the content.\n",
"    \"\"\"\n",
"    notes_w = 19  # width of the widest note string ('After Preprocessing')\n",
"    table_w = 2 + 30 + 3 + 20 + 3 + 20 + 3 + notes_w + 2  # full printed row width\n",
"\n",
"    src_data = metrics['English']  # source/English-side metrics\n",
"    tgt_data = metrics['Target']  # target-side metrics\n",
"\n",
"    print(\"\\n\" + \"=\" * table_w)  # top border sized to the table\n",
"    print(f\" FILTERED CORPUS METRICS ({FILEPATH_FILTERED})\")\n",
"    print(\"=\" * table_w)\n",
"\n",
"    # Header row.\n",
"    print(f\"| {'Metric':<30} | {'Source (English)':>20} | {'Target (Target)':>20} | {'Notes':<{notes_w}} |\")\n",
"    print(\"-\" * table_w)\n",
"\n",
"    # Row: sentence pairs (identical on both sides by construction).\n",
"    print(f\"| {'Sentence Pairs':<30} | {sentence_pairs:>20,} | {sentence_pairs:>20,} | {'--':<{notes_w}} |\")\n",
"\n",
"    # Row: token totals, thousands-separated.\n",
"    src_tokens_display = f\"{src_data['tokens']:,}\"\n",
"    tgt_tokens_display = f\"{tgt_data['tokens']:,}\"\n",
"    print(f\"| {'Tokens':<30} | {src_tokens_display:>20} | {tgt_tokens_display:>20} | {'Actual Count':<{notes_w}} |\")\n",
"\n",
"    # Row: average tokens per sentence pair, two decimals.\n",
"    print(f\"| {'Avg. Sentence Length':<30} | {src_data['avg_len']:>20.2f} | {tgt_data['avg_len']:>20.2f} | {'Tokens/Pair':<{notes_w}} |\")\n",
"\n",
"    # Row: unique-token counts after punctuation stripping and lowercasing.\n",
"    print(f\"| {'Vocabulary Size':<30} | {src_data['vocab_size']:>20,} | {tgt_data['vocab_size']:>20,} | {'After Preprocessing':<{notes_w}} |\")\n",
"\n",
"    # Row: OOV rate needs a held-out test set, which this script does not have.\n",
"    print(f\"| {'OOV Rate':<30} | {'--':>20} | {'--':>20} | {'Requires Test Set':<{notes_w}} |\")\n",
"\n",
"    print(\"-\" * table_w)  # bottom rule, same width as the rows\n",
"# blank line for readability\n",
"# blank line for readability\n",
"if __name__ == \"__main__\":  # script entrypoint guard\n",
"    print(\"NOTE: Running filtering logic to ensure 'filtered_corpus_top_20.csv' is up-to-date for metrics.\")\n",
"\n",
"    # 1. Filter the raw corpus down to the top TARGET_PERCENTILE slice by score.\n",
"    filtered_df, threshold = calculate_knowledge_threshold(FILEPATH_RAW, TARGET_PERCENTILE)\n",
"\n",
"    if filtered_df.empty:  # nothing survived filtering (or loading failed)\n",
"        print(\"Could not generate filtered corpus. Cannot proceed with metrics calculation.\")\n",
"    else:\n",
"        # 2. Persist the filtered corpus so the metrics step reads the same data.\n",
"        filtered_df.to_csv(OUTPUT_FILENAME, index=False)\n",
"        print(f\"Filtered corpus saved to '{OUTPUT_FILENAME}'.\")\n",
"\n",
"        # 3. Compute metrics on the saved file. Guard the failure sentinel:\n",
"        # calculate_corpus_metrics signals failure with None (or a (None, None)\n",
"        # pair); unpacking a bare None directly would raise a TypeError here.\n",
"        result = calculate_corpus_metrics(FILEPATH_FILTERED)\n",
"        if result is not None and result[1]:\n",
"            sentence_pairs, metrics = result\n",
"            format_and_print_results(sentence_pairs, metrics)  # print the formatted table\n",
"        else:\n",
"            print(\"Metrics calculation failed.\")\n",
"# end of cell"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ptorch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|