import pickle import pandas as pd from sklearn.model_selection import train_test_split from openpyxl.utils import get_column_letter from openpyxl import load_workbook from openpyxl.workbook import Workbook INPUT_FILE = "AimoScore_WeakLink_big_scores.xls" # or .xls if using xlrd engine SCORE_PICKLE = "AimoScore_WeakLink_big_scores.pkl" OUTPUT_FILE = "AimoScore_deduped.xlsx" REMOVED_REPORT = "AimoScore_removed_report.csv" # old implemenation to load from original excel sheets = pd.read_excel(INPUT_FILE, sheet_name=None) df = next(iter(sheets.values())) """ with open(SCORE_PICKLE, "rb") as f: sheets = pickle.load(f) df = sheets["Sheet1"] #print(df) """ required_cols = {"AimoScore", "EstimatedScore"} if not required_cols.issubset(df.columns): raise ValueError(f"Input file must contain columns: {required_cols}. Found: {df.columns.tolist()}") counts = df["AimoScore"].value_counts() duplicated_values = set(counts[counts > 1].index) mask_removed = df["AimoScore"].isin(duplicated_values) removed_rows = df[mask_removed].copy().reset_index(drop=True) kept_rows = df[~mask_removed].copy() # Add comparison columns for removed rows removed_rows["Diff"] = removed_rows["EstimatedScore"] - removed_rows["AimoScore"] # Convert RelDiff into boolean Quality estimator: True if absolute rel diff > 0.1 removed_rows["Threshold"] = removed_rows["Diff"].apply( lambda x: bool(abs(x) > 0.1) if pd.notna(x) else False ) # Only keep the requested columns for the removed report removed_report_df = removed_rows[["EstimatedScore", "Diff", "Threshold"]] # Save removed report (overwrite previous save) removed_report_df.to_csv(REMOVED_REPORT, index=False) agg = removed_rows.groupby("AimoScore").agg( count_removed=("AimoScore", "size"), mean_estimated=("EstimatedScore", "mean"), median_estimated=("EstimatedScore", "median"), mean_diff=("Diff", "mean") ).reset_index() # Save kept rows to Excel via pandas (openpyxl engine creates file) kept_rows.to_excel(OUTPUT_FILE, index=False, engine="openpyxl") # Adjust column widths using openpyxl wb = load_workbook(OUTPUT_FILE) ws = wb.active # Strategy: set each column width to max(length of header, max cell string length) capped to a sensible range min_width = 15 max_width = 60 for i, col in enumerate(kept_rows.columns, start=1): col_letter = get_column_letter(i) # header length max_len = len(str(col)) # check cell values in column for cell in ws[col_letter]: if cell.value is not None: cell_len = len(str(cell.value)) if cell_len > max_len: max_len = cell_len # set width with caps and small padding adjusted_width = min(max(max_len + 2, min_width), max_width) ws.column_dimensions[col_letter].width = adjusted_width wb.save(OUTPUT_FILE) # Split kept rows into 80% train and 20% test (random, reproducible) train_df, test_df = train_test_split(kept_rows, test_size=0.2, random_state=42, shuffle=True) # Optional: reset index train_df = train_df.reset_index(drop=True) test_df = test_df.reset_index(drop=True) # Save CSVs train_df.to_csv("train.csv", index=False) test_df.to_csv("test.csv", index=False) print(f"Train rows: {len(train_df)}") print(f"Test rows: {len(test_df)}") # Save reports removed_rows.to_csv(REMOVED_REPORT, index=False) agg.to_csv("AimoScore_removed_agg.csv", index=False) print(f"Original rows: {len(df)}") print(f"Removed rows: {len(removed_rows)} (duplicated AimoScore values: {len(duplicated_values)})") print(f"Kept rows: {len(kept_rows)}")