# Source: nse-bot-backend / forward_test_xgboost.py
# Uploaded by ash001: "Deploy from GitHub Actions to nse-bot-backend"
# (commit 789e5eb, verified)
from pathlib import Path
import json
import joblib
import pandas as pd
# --- Filesystem layout -------------------------------------------------------
# Every artifact (model, input dataset, outputs) lives in the "outputs" folder
# that sits next to this script.
BASE_DIR = Path(__file__).resolve().parents[0]
OUT_DIR = BASE_DIR.joinpath("outputs")

# Fitted pipeline that main() loads and scores with.
MODEL_PATH = OUT_DIR.joinpath("xgboost_label_1to1_pipeline.joblib")

# Input dataset to score.
DATA_PATH = OUT_DIR.joinpath(
    "ml_dataset_exact_all_v2_2026-03-02_to_2026-03-06.csv"
)

# Outputs: the per-trade scored CSV and the per-threshold JSON summary.
OUT_SCORED_PATH = OUT_DIR.joinpath(
    "forward_scored_xgboost_2026-03-02_to_2026-03-06.csv"
)
OUT_SUMMARY_PATH = OUT_DIR.joinpath(
    "forward_summary_xgboost_2026-03-02_to_2026-03-06.json"
)

# Probability cut-offs evaluated by the summary step.
THRESHOLDS = [0.55, 0.60]

# Columns removed from the feature matrix whenever they are present.
DROP_COLS_ALWAYS = [
    "trade_key",
    "label_1to1",
    "label_1to2",
    "bt_buy_signal_time",
    "bt_sell_signal_time",
    "bt_buy_time",
    "bt_buy_price",
    "bt_stop_loss",
    "bt_target_1",
    "bt_target_2",
    "bt_qty_per_lot",
    "bt_capital_per_lot",
    "bt_stop_loss_amt_per_lot",
    "signal_time",
    "confirmation_time",
    "indication_time",
    "buy_time",
]

# Additional columns that are likewise removed when present.
OPTIONAL_DROP_COLS = [
    "exit_status",
    "option_symbol",
    "trade_side",
]
def build_feature_matrix(df: pd.DataFrame):
    """Return a copy of ``df`` reduced to the columns fed to the model.

    Every column named in ``DROP_COLS_ALWAYS`` or ``OPTIONAL_DROP_COLS`` that
    exists in ``df`` is removed. If a ``sector`` column remains, missing values
    and empty strings in it are replaced with ``"UNKNOWN"``.

    Args:
        df: Raw trades frame; it is not modified.

    Returns:
        A new DataFrame holding the remaining columns.
    """
    candidates = [*DROP_COLS_ALWAYS, *OPTIONAL_DROP_COLS]
    to_remove = [name for name in candidates if name in df.columns]
    features = df.drop(columns=to_remove, errors="ignore").copy()

    if "sector" not in features.columns:
        return features

    # Normalise both NaN and "" to a single placeholder category.
    sector = features["sector"].fillna("UNKNOWN")
    features["sector"] = sector.replace("", "UNKNOWN")
    return features
def summarize_for_threshold(df: pd.DataFrame, threshold: float) -> dict:
    """Summarize 1:1 outcomes for trades whose score clears ``threshold``.

    Trades with ``y_prob >= threshold`` are kept. Each kept trade is assigned
    an approximate realized PnL per lot: ``-bt_stop_loss_amt_per_lot`` when
    ``label_1to1 == 0`` and ``+bt_stop_loss_amt_per_lot`` otherwise (for a 1:1
    setup the target profit is taken to equal the stop-loss amount).

    Args:
        df: Scored trades. Reads ``y_prob``, ``label_1to1`` and
            ``bt_stop_loss_amt_per_lot``; ``trade_date`` is used for the daily
            breakdown when the column is present. ``df`` is not modified.
        threshold: Inclusive minimum ``y_prob`` for a trade to be kept.

    Returns:
        A dict with the keys ``threshold``, ``total_trades``, ``kept_trades``,
        ``coverage_pct``, ``hit_rate_1to1``, ``gross_pnl_1to1_per_lot_sum``,
        ``avg_pnl_1to1_per_lot`` and ``daily_breakdown``. When no trade is
        kept, the rate/average fields are ``None`` and ``daily_breakdown`` is
        an empty list, so every summary shares the same set of keys.
    """
    selected = df[df["y_prob"] >= threshold]
    total = len(df)
    kept_n = len(selected)

    # Also covers an empty ``df`` (total == 0), so the coverage division
    # below never sees a zero denominator.
    if kept_n == 0:
        return {
            "threshold": threshold,
            "total_trades": total,
            "kept_trades": 0,
            "coverage_pct": 0.0,
            "hit_rate_1to1": None,
            "gross_pnl_1to1_per_lot_sum": 0.0,
            "avg_pnl_1to1_per_lot": None,
            "daily_breakdown": [],
        }

    # Vectorized PnL: keep +risk unless the label is exactly 0, in which case
    # the trade is booked as a loss of the stop-loss amount.
    risk = selected["bt_stop_loss_amt_per_lot"]
    pnl = risk.where(selected["label_1to1"] != 0, -risk)
    kept = selected.assign(realized_pnl_1to1_per_lot=pnl)

    hit_rate_1to1 = kept["label_1to1"].mean()
    gross_pnl = kept["realized_pnl_1to1_per_lot"].sum()
    avg_pnl = kept["realized_pnl_1to1_per_lot"].mean()

    if "trade_date" in kept.columns:
        by_day = (
            kept.groupby("trade_date")
            .agg(
                trades=("label_1to1", "size"),
                wins=("label_1to1", "sum"),
                pnl_1to1_per_lot=("realized_pnl_1to1_per_lot", "sum"),
            )
            .reset_index()
        )
        # Render dates as plain strings so the records are JSON-serializable.
        by_day["trade_date"] = pd.to_datetime(
            by_day["trade_date"], errors="coerce"
        ).dt.strftime("%Y-%m-%d")
        daily_breakdown = by_day.to_dict(orient="records")
    else:
        # Without a date column there is nothing to group by.
        daily_breakdown = []

    return {
        "threshold": threshold,
        "total_trades": total,
        "kept_trades": kept_n,
        "coverage_pct": round((kept_n / total) * 100, 2),
        "hit_rate_1to1": round(float(hit_rate_1to1), 4),
        "gross_pnl_1to1_per_lot_sum": round(float(gross_pnl), 2),
        "avg_pnl_1to1_per_lot": round(float(avg_pnl), 2),
        "daily_breakdown": daily_breakdown,
    }
def main():
    """Score the forward-test dataset and write the scored CSV and summary.

    Loads the pipeline from ``MODEL_PATH`` and the trades from ``DATA_PATH``,
    adds a ``y_prob`` column taken from the second column of
    ``predict_proba``, writes the scored frame to ``OUT_SCORED_PATH``, and
    dumps one summary per entry in ``THRESHOLDS`` to ``OUT_SUMMARY_PATH``.
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted models.
    pipeline = joblib.load(MODEL_PATH)
    trades = pd.read_csv(DATA_PATH)

    if "trade_date" in trades.columns:
        # Unparseable dates become NaT instead of raising.
        trades["trade_date"] = pd.to_datetime(
            trades["trade_date"], errors="coerce"
        )

    features = build_feature_matrix(trades)
    probabilities = pipeline.predict_proba(features)
    trades["y_prob"] = probabilities[:, 1]
    trades.to_csv(OUT_SCORED_PATH, index=False)

    summaries = [
        summarize_for_threshold(trades, cutoff) for cutoff in THRESHOLDS
    ]
    with open(OUT_SUMMARY_PATH, "w") as handle:
        json.dump(summaries, handle, indent=2)

    print(f"Saved scored trades to: {OUT_SCORED_PATH}")
    print(f"Saved summary to: {OUT_SUMMARY_PATH}")
    for summary in summaries:
        print("\n", summary)


if __name__ == "__main__":
    main()