import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from openpyxl.utils import get_column_letter
from openpyxl import load_workbook
from openpyxl.workbook import Workbook

INPUT_FILE = "AimoScore_WeakLink_big_scores.xls"  # or .xls if using xlrd engine
SCORE_PICKLE = "AimoScore_WeakLink_big_scores.pkl"
OUTPUT_FILE = "AimoScore_deduped.xlsx"
REMOVED_REPORT = "AimoScore_removed_report.csv"

# old implemenation to load from original excel
sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
df = next(iter(sheets.values()))
"""
with open(SCORE_PICKLE, "rb") as f:
    sheets = pickle.load(f)
    df = sheets["Sheet1"]
    #print(df)
"""

required_cols = {"AimoScore", "EstimatedScore"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"Input file must contain columns: {required_cols}. Found: {df.columns.tolist()}")

counts = df["AimoScore"].value_counts()
duplicated_values = set(counts[counts > 1].index)
mask_removed = df["AimoScore"].isin(duplicated_values)
removed_rows = df[mask_removed].copy().reset_index(drop=True)
kept_rows = df[~mask_removed].copy()

# Add comparison columns for removed rows
removed_rows["Diff"] = removed_rows["EstimatedScore"] - removed_rows["AimoScore"]


# Convert RelDiff into boolean Quality estimator: True if absolute rel diff > 0.1
removed_rows["Threshold"] = removed_rows["Diff"].apply(
    lambda x: bool(abs(x) > 0.1) if pd.notna(x) else False
)

# Only keep the requested columns for the removed report
removed_report_df = removed_rows[["EstimatedScore", "Diff", "Threshold"]]

# Save removed report (overwrite previous save)
removed_report_df.to_csv(REMOVED_REPORT, index=False)


agg = removed_rows.groupby("AimoScore").agg(
    count_removed=("AimoScore", "size"),
    mean_estimated=("EstimatedScore", "mean"),
    median_estimated=("EstimatedScore", "median"),
    mean_diff=("Diff", "mean")
).reset_index()

# Save kept rows to Excel via pandas (openpyxl engine creates file)
kept_rows.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")

# Adjust column widths using openpyxl
wb = load_workbook(OUTPUT_FILE)
ws = wb.active

# Strategy: set each column width to max(length of header, max cell string length) capped to a sensible range
min_width = 15
max_width = 60

for i, col in enumerate(kept_rows.columns, start=1):
    col_letter = get_column_letter(i)
    # header length
    max_len = len(str(col))
    # check cell values in column
    for cell in ws[col_letter]:
        if cell.value is not None:
            cell_len = len(str(cell.value))
            if cell_len > max_len:
                max_len = cell_len
    # set width with caps and small padding
    adjusted_width = min(max(max_len + 2, min_width), max_width)
    ws.column_dimensions[col_letter].width = adjusted_width

wb.save(OUTPUT_FILE)

# Split kept rows into 80% train and 20% test (random, reproducible)
train_df, test_df = train_test_split(kept_rows, test_size=0.2, random_state=42, shuffle=True)

# Optional: reset index
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

print(f"Train rows: {len(train_df)}")
print(f"Test rows: {len(test_df)}")

# Save reports
removed_rows.to_csv(REMOVED_REPORT, index=False)
agg.to_csv("AimoScore_removed_agg.csv", index=False)

print(f"Original rows: {len(df)}")
print(f"Removed rows: {len(removed_rows)} (duplicated AimoScore values: {len(duplicated_values)})")
print(f"Kept rows: {len(kept_rows)}")