from pathlib import Path
import pandas as pd
import numpy as np

# Directory that contains this script; every path below hangs off it.
BASE_DIR = Path(__file__).resolve().parent
OUT_DIR = BASE_DIR.joinpath("outputs")

# Input CSV read by main(); it must provide the columns model, y_true, y_prob.
PRED_PATH = OUT_DIR.joinpath("model_predictions_label_1to1.csv")
# Output CSV that main() writes the evaluation table to.
OUT_PATH = OUT_DIR.joinpath("threshold_evaluation_label_1to1.csv")

# Probability cutoffs: a trade is kept when its y_prob is >= the cutoff.
THRESHOLDS = [
    0.50,
    0.55,
    0.60,
    0.65,
    0.70,
    0.75,
    0.80,
]


# Column order of the result frame. It is passed to the DataFrame constructor
# explicitly so the frame keeps its schema even when no rows are produced.
_RESULT_COLUMNS = [
    "model",
    "mode",
    "threshold",
    "top_pct",
    "total_trades",
    "kept_trades",
    "coverage_pct",
    "baseline_hit_rate",
    "kept_hit_rate",
    "lift_vs_baseline_pct",
    "baseline_expectancy_R",
    "kept_expectancy_R",
    "wins_kept",
    "losses_kept",
]


def _summary_row(model_name, mode, threshold, top_pct, kept, total_trades, baseline_hit_rate):
    """Build one result row describing the ``kept`` subset of a model's trades.

    When ``kept`` is empty the kept-side metrics are ``None`` and the win/loss
    counts are 0. Lift is ``None`` whenever the baseline hit rate is not
    positive, because the ratio is undefined there.
    """
    kept_trades = len(kept)
    row = {
        "model": model_name,
        "mode": mode,
        "threshold": threshold,
        "top_pct": top_pct,
        "total_trades": total_trades,
        "kept_trades": kept_trades,
        # Guard the division so a zero-size group reports 0% coverage.
        "coverage_pct": round((kept_trades / total_trades) * 100, 2) if total_trades else 0.0,
        "baseline_hit_rate": round(baseline_hit_rate, 4),
        "kept_hit_rate": None,
        "lift_vs_baseline_pct": None,
        "baseline_expectancy_R": round(2 * baseline_hit_rate - 1, 4),
        "kept_expectancy_R": None,
        "wins_kept": 0,
        "losses_kept": 0,
    }
    if kept_trades == 0:
        return row

    wins = kept["y_true"].sum()
    kept_hit_rate = kept["y_true"].mean()
    if baseline_hit_rate > 0:
        row["lift_vs_baseline_pct"] = round(((kept_hit_rate / baseline_hit_rate) - 1) * 100, 2)
    row["kept_hit_rate"] = round(kept_hit_rate, 4)
    row["kept_expectancy_R"] = round(2 * kept_hit_rate - 1, 4)
    row["wins_kept"] = int(wins)
    row["losses_kept"] = int(kept_trades - wins)
    return row


def evaluate_thresholds(df: pd.DataFrame, thresholds=None, top_pcts=(10, 20, 30, 40, 50)) -> pd.DataFrame:
    """Summarise, per model, how filtering trades by predicted probability performs.

    For every group of ``df["model"]`` two families of rows are produced:

    * ``mode == "threshold"``: trades whose ``y_prob`` is >= each cutoff.
    * ``mode == "top_pct"``: the ``ceil(total * pct / 100)`` trades (at least
      one) with the highest ``y_prob``; the ``threshold`` column then holds the
      lowest ``y_prob`` inside that bucket.

    Hit rate is the mean of ``y_true``; expectancy is ``2 * hit_rate - 1``
    (presumably a 1:1 reward-to-risk payoff, as the file names suggest --
    confirm with the labelling code).

    Args:
        df: Predictions with the columns ``model``, ``y_true`` and ``y_prob``.
        thresholds: Iterable of probability cutoffs. ``None`` (default) uses
            the module-level ``THRESHOLDS``.
        top_pcts: Iterable of percentages for the top-percentile buckets.

    Returns:
        A DataFrame with one row per (model, cutoff) and per (model, percentile),
        always carrying the full column set -- including when ``df`` is empty.
    """
    if thresholds is None:
        thresholds = THRESHOLDS
    # Materialise once: both iterables are re-walked for every model group.
    thresholds = list(thresholds)
    top_pcts = list(top_pcts)

    rows = []
    for model_name, group in df.groupby("model"):
        total_trades = len(group)
        baseline_hit_rate = group["y_true"].mean() if total_trades else 0.0

        # threshold analysis
        for th in thresholds:
            kept = group[group["y_prob"] >= th]
            rows.append(
                _summary_row(model_name, "threshold", th, None, kept, total_trades, baseline_hit_rate)
            )

        # top percentile analysis
        # Sort once per model (the ranking does not depend on the percentile) and
        # use a stable sort so ties at a bucket boundary resolve by input order.
        ranked = group.sort_values("y_prob", ascending=False, kind="mergesort")
        for top_pct in top_pcts:
            k = max(int(np.ceil(total_trades * top_pct / 100)), 1)
            kept = ranked.head(k)
            min_prob = round(float(kept["y_prob"].min()), 4) if len(kept) else None
            rows.append(
                _summary_row(model_name, "top_pct", min_prob, top_pct, kept, total_trades, baseline_hit_rate)
            )

    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def main():
    """Load the predictions CSV, evaluate it, save the table and print summaries.

    Raises:
        ValueError: If the predictions file lacks any of the columns
            ``model``, ``y_true`` or ``y_prob``.
    """
    predictions = pd.read_csv(PRED_PATH)

    # Fail early with a clear message rather than a KeyError deep in the evaluation.
    absent = {"model", "y_true", "y_prob"} - set(predictions.columns)
    if absent:
        raise ValueError(f"Missing required columns in predictions file: {absent}")

    report = evaluate_thresholds(predictions)
    report.to_csv(OUT_PATH, index=False)

    print(f"Saved threshold evaluation to: {OUT_PATH}")

    # (heading, value of the "mode" column, secondary sort column)
    sections = (
        ("\n=== Threshold rows only ===", "threshold", "threshold"),
        ("\n=== Top-percentile rows only ===", "top_pct", "top_pct"),
    )
    for heading, mode, sort_col in sections:
        print(heading)
        subset = report[report["mode"] == mode]
        print(subset.sort_values(["model", sort_col]).to_string(index=False))

    print("\n=== Best rows by kept_expectancy_R (minimum 100 kept trades) ===")
    # Only rank rows that kept at least 100 trades and have a defined expectancy.
    has_enough_trades = report["kept_trades"] >= 100
    has_expectancy = report["kept_expectancy_R"].notna()
    candidates = report[has_enough_trades & has_expectancy].copy()
    if not candidates.empty:
        ranked = candidates.sort_values(
            ["kept_expectancy_R", "kept_hit_rate", "kept_trades"],
            ascending=[False, False, False],
        )
        print(ranked.head(15).to_string(index=False))
    else:
        print("No qualifying rows")


if __name__ == "__main__":
    main()