In [None]:
import pandas as pd # import pandas and alias as pd for DataFrame operations
# blank line kept for readability
# Load the corpus that is to be deduplicated.
df = pd.read_csv("filtered_corpus_here.csv")

# Build whitespace-normalised helper columns: edges are stripped and every
# run of whitespace is collapsed to one space, so pairs that differ only in
# spacing (e.g. "hello   world" vs "hello world") compare equal.
for raw_col, norm_col in (('src_lang', 'src_norm'), ('tgt_lang', 'tgt_norm')):
    df[norm_col] = df[raw_col].str.strip().str.replace(r'\s+', ' ', regex=True)

# Keep the first occurrence of each normalised (source, target) pair, then
# discard the helper columns so the saved file contains only the original
# columns.
helper_cols = ['src_norm', 'tgt_norm']
df_unique = df.drop_duplicates(subset=helper_cols, keep='first').drop(columns=helper_cols)

# Write the deduplicated corpus without the DataFrame index.
df_unique.to_csv("filtered_corpus_here_removedDuplicates.csv", index=False)

# Report how many rows the deduplication removed.
rows_before, rows_after = len(df), len(df_unique)
print("Done. Original rows:", rows_before)
print("New rows:", rows_after)
print("Removed:", rows_before - rows_after)
# end of cell

In [None]:
import pandas as pd # pandas for DataFrame operations
import numpy as np # numpy for numeric utilities (unused but commonly imported)
import string # string constants, used to build punctuation translator
import os # os module for file/path operations (imported for potential use)
# blank line for readability
# --- Configuration ---
# Input: the deduplicated corpus. Output: the rows kept by the score filter.
FILEPATH_RAW = "filtered_corpus_here_removedDuplicates.csv"
FILEPATH_FILTERED = "filtered_corpus_top_20.csv"
OUTPUT_FILENAME = FILEPATH_FILTERED  # the filter writes to the file the metrics step reads

# Column names expected in the corpus CSV.
SOURCE_LANG_COL, TARGET_LANG_COL = "src_lang", "tgt_lang"
SCORE_COL = "KS_i"  # Knowledge Score column

# Rows whose score is at or above this percentile of SCORE_COL are kept.
TARGET_PERCENTILE = 80
# --- End Configuration ---
# blank line for readability
def calculate_knowledge_threshold(filepath, percentile, *, score_col=None):
    """Filter a corpus CSV to the rows at or above a score percentile.

    Args:
        filepath: Path (or file-like object) passed straight to
            ``pd.read_csv``.
        percentile: Percentile in the range [0, 100] used as the cut-off.
        score_col: Name of the column holding the score. Defaults to the
            module-level ``SCORE_COL``.

    Returns:
        A ``(D_filtered, tau_K)`` tuple: a copy of the rows whose score is
        ``>= tau_K`` and the threshold itself. On any failure (missing file,
        missing column, invalid percentile, no numeric scores, unexpected
        error) a message is printed and ``(pd.DataFrame(), None)`` is
        returned; the function never raises.
    """
    if score_col is None:
        score_col = SCORE_COL

    if not 0 <= percentile <= 100:
        print(f"Error: percentile must be between 0 and 100, got {percentile}.")
        return pd.DataFrame(), None

    try:
        # 1. Load the data.
        df = pd.read_csv(filepath)

        # 1.5. The score column must exist before anything else can happen.
        if score_col not in df.columns:
            print(f"Error: The CSV file must contain a column named '{score_col}'. Found columns: {list(df.columns)}")
            return pd.DataFrame(), None

        # Coerce unparsable scores to NaN and drop them so the quantile is
        # computed over numeric values only.
        df_initial_size = len(df)
        df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
        df = df.dropna(subset=[score_col])

        if len(df) < df_initial_size:
            print(f"Warning: Dropped {df_initial_size - len(df)} rows with non-numeric scores.")

        # With no usable scores the quantile would be NaN; report the
        # failure with the same sentinel return as the other error paths.
        if df.empty:
            print(f"Error: No rows with a numeric '{score_col}' value were found.")
            return pd.DataFrame(), None

        # 2. Calculate the threshold (tau_K).
        tau_K = df[score_col].quantile(percentile / 100, interpolation='linear')

        # 3. Keep the rows at or above the threshold (D_filtered).
        D_filtered = df[df[score_col] >= tau_K].copy()

        return D_filtered, tau_K
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found. Please ensure it exists.")
        return pd.DataFrame(), None
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on the sentinel
        # return value rather than on exceptions.
        print(f"An unexpected error occurred during filtering: {e}")
        return pd.DataFrame(), None
# blank line for readability
def tokenize_and_clean(text_series):
    """Return one flat list of lowercase tokens from a Series of text.

    Each ASCII punctuation character (``string.punctuation``) is replaced by
    a space, the text is lowercased, and the result is split on whitespace.
    Because punctuation becomes a separator, "don't" yields the two tokens
    "don" and "t".

    Missing values (None / NaN) are skipped; converting them with
    ``astype(str)`` would otherwise inject a spurious "nan" token per
    missing cell and distort token counts and vocabulary size.

    Args:
        text_series: pandas Series of sentences.

    Returns:
        A list of tokens in row order.
    """
    # Map every ASCII punctuation character to a space.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    all_tokens = []
    for text in text_series.dropna().astype(str):
        all_tokens.extend(text.translate(translator).lower().split())
    return all_tokens
# blank line for readability
def calculate_corpus_metrics(filepath):
    """Compute sentence-pair, token, length and vocabulary statistics.

    Args:
        filepath: Path of the corpus CSV to analyse.

    Returns:
        ``(sentence_pairs, metrics)`` where ``metrics`` maps ``'English'``
        (source column) and ``'Target'`` (target column) to a dict with the
        keys ``'tokens'``, ``'avg_len'`` and ``'vocab_size'``. Returns
        ``None`` (not a tuple) when the file is missing or empty.

    Raises:
        KeyError: If the source or target column is absent from the file.
    """
    try:
        corpus = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found. Please ensure it exists.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{filepath}' is empty.")
        return None

    # One row per sentence pair.
    n_pairs = len(corpus)

    # Fail on the first expected column that is not present.
    missing = next(
        (name for name in (SOURCE_LANG_COL, TARGET_LANG_COL) if name not in corpus.columns),
        None,
    )
    if missing is not None:
        raise KeyError(f"Column '{missing}' not found in the corpus file.")

    def _side_stats(column):
        # Token count, mean tokens per pair, and distinct-token count.
        toks = tokenize_and_clean(corpus[column])
        count = len(toks)
        return {
            'tokens': count,
            'avg_len': count / n_pairs if n_pairs > 0 else 0,  # avoid dividing by zero
            'vocab_size': len(set(toks)),
        }

    stats = {
        'English': _side_stats(SOURCE_LANG_COL),
        'Target': _side_stats(TARGET_LANG_COL),
    }
    return n_pairs, stats
# blank line for readability
def format_and_print_results(sentence_pairs, metrics, *, corpus_label=None):
    """Print the corpus metrics as a fixed-width text table.

    Args:
        sentence_pairs: Number of sentence pairs in the corpus.
        metrics: Dict with the keys ``'English'`` and ``'Target'``, each
            mapping to a dict with ``'tokens'``, ``'avg_len'`` and
            ``'vocab_size'``.
        corpus_label: Text shown in the title parentheses. Defaults to the
            module-level ``FILEPATH_FILTERED``.

    The horizontal rules are sized from the header row so they always match
    the table width, and the Notes column is wide enough for every note.
    """
    if corpus_label is None:
        corpus_label = FILEPATH_FILTERED

    src_data = metrics['English']
    tgt_data = metrics['Target']

    def _row(metric, src_cell, tgt_cell, note):
        # Single source of truth for the column widths.
        return f"| {metric:<30} | {src_cell:>20} | {tgt_cell:>20} | {note:<20} |"

    header = _row('Metric', 'Source (English)', 'Target (Target)', 'Notes')
    width = len(header)

    pairs_display = f"{sentence_pairs:,}"
    rows = [
        _row('Sentence Pairs', pairs_display, pairs_display, '--'),
        # Token counts are shown in full with thousands separators.
        _row('Tokens', f"{src_data['tokens']:,}", f"{tgt_data['tokens']:,}", 'Actual Count'),
        _row('Avg. Sentence Length', f"{src_data['avg_len']:.2f}", f"{tgt_data['avg_len']:.2f}", 'Tokens/Pair'),
        _row('Vocabulary Size', f"{src_data['vocab_size']:,}", f"{tgt_data['vocab_size']:,}", 'After Preprocessing'),
        # Placeholder row: no OOV rate is computed here.
        _row('OOV Rate', '--', '--', 'Requires Test Set'),
    ]

    print("\n" + "=" * width)
    print(f" FILTERED CORPUS METRICS ({corpus_label})")
    print("=" * width)
    print(header)
    print("-" * width)
    for line in rows:
        print(line)
    print("-" * width)
# blank line for readability
# blank line for readability
if __name__ == "__main__":
    # Script entry point: filter the deduplicated corpus by score, save the
    # result, then compute and print metrics for the saved file.
    print("NOTE: Running filtering logic to ensure 'filtered_corpus_top_20.csv' is up-to-date for metrics.")

    # 1. Run the filtering logic.
    filtered_df, threshold = calculate_knowledge_threshold(FILEPATH_RAW, TARGET_PERCENTILE)

    if filtered_df.empty:
        print("Could not generate filtered corpus. Cannot proceed with metrics calculation.")
    else:
        # 2. Save the filtered corpus so the metrics step can read it back.
        filtered_df.to_csv(OUTPUT_FILENAME, index=False)
        print(f"Filtered corpus saved to '{OUTPUT_FILENAME}'.")

        # 3. Compute metrics on the saved file. calculate_corpus_metrics
        # returns a bare None on failure, so it must not be unpacked
        # directly; doing so raises TypeError and hides the failure message.
        result = calculate_corpus_metrics(FILEPATH_FILTERED)
        sentence_pairs, metrics = result if result is not None else (None, None)

        if metrics:
            format_and_print_results(sentence_pairs, metrics)
        else:
            print("Metrics calculation failed.")
# end of cell