Spaces:
Running
Running
| from pathlib import Path | |
| import json | |
| import warnings | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
# Suppress library warnings (pandas/sklearn chatter) for cleaner script output.
warnings.filterwarnings('ignore')

# All paths are resolved relative to this script so it runs from any CWD.
BASE_DIR = Path(__file__).resolve().parent
OUT_DIR = BASE_DIR / 'outputs'

# Prefer Mar27 merged file, fall back to Mar25 merged file if user hasn't updated yet
PREFERRED_DATA_PATHS = [
    OUT_DIR / 'ml_dataset_exact_all_v2_2026-01-01_to_2026-03-27_merged.csv',
    OUT_DIR / 'ml_dataset_exact_all_v2_2026-01-01_to_2026-03-25_merged.csv',
]

# Single held-out session being evaluated, and the probability cutoffs to sweep.
FRIDAY_DATE = '2026-03-27'
THRESHOLDS = [0.46, 0.48, 0.50, 0.52, 0.55]
# Binary target column name (1:1 risk/reward outcome).
TARGET = 'label_1to1'

# Candidate gating models: each entry pairs a fitted sklearn preprocessor
# (joblib) with a trained Keras model file. All artifacts live in OUT_DIR.
MODELS = [
    {
        'name': 'old_champion',
        'preprocessor': OUT_DIR / 'nn_preprocessor_label_1to1_jan_to_mar12_v2.joblib',
        'model': OUT_DIR / 'nn_label_1to1_jan_to_mar12_v2.keras',
    },
    {
        'name': 'same_arch_jan_to_mar25_v1',
        'preprocessor': OUT_DIR / 'nn_preprocessor_label_1to1_jan_to_mar25_same_arch_v1.joblib',
        'model': OUT_DIR / 'nn_label_1to1_jan_to_mar25_same_arch_v1.keras',
    },
    {
        'name': 'large_arch_jan_to_mar25_v1',
        'preprocessor': OUT_DIR / 'nn_preprocessor_label_1to1_jan_to_mar25_large_v1.joblib',
        'model': OUT_DIR / 'nn_label_1to1_jan_to_mar25_large_v1.keras',
    },
]

# Columns always excluded from the feature matrix: identifiers, timestamps,
# labels, and backtest outcome fields that would leak the target.
DROP_COLS_ALWAYS = [
    'trade_key',
    'label_1to1',
    'label_1to2',
    'bt_buy_signal_time',
    'bt_sell_signal_time',
    'bt_buy_time',
    'bt_buy_price',
    'bt_stop_loss',
    'bt_target_1',
    'bt_target_2',
    'bt_qty_per_lot',
    'bt_capital_per_lot',
    'bt_stop_loss_amt_per_lot',
    'signal_time',
    'confirmation_time',
    'indication_time',
    'buy_time',
]

# Dropped only when present -- the dataset schema varies between versions.
OPTIONAL_DROP_COLS = [
    'exit_status',
    'option_symbol',
    'trade_side',
]
def resolve_data_path() -> Path:
    """Return the first existing merged-dataset file, newest preference first.

    Raises:
        FileNotFoundError: if none of the candidate paths exist on disk.
    """
    found = next((candidate for candidate in PREFERRED_DATA_PATHS if candidate.exists()), None)
    if found is not None:
        return found
    expected = '\n'.join(str(p) for p in PREFERRED_DATA_PATHS)
    raise FileNotFoundError(
        'Could not find merged dataset. Expected one of:\n' + expected
    )
def load_friday_data() -> pd.DataFrame:
    """Load the merged dataset and return only the rows dated FRIDAY_DATE.

    Rows are sorted chronologically by whichever time columns are present.

    Raises:
        ValueError: if the dataset contains no rows for FRIDAY_DATE.
    """
    data_path = resolve_data_path()
    full = pd.read_csv(data_path)
    full['trade_date'] = pd.to_datetime(full['trade_date'], errors='coerce')

    is_friday = full['trade_date'].dt.strftime('%Y-%m-%d') == FRIDAY_DATE
    friday = full[is_friday].copy()

    preferred_order = ('trade_date', 'signal_time', 'confirmation_time', 'buy_time', 'trade_key')
    sort_cols = [col for col in preferred_order if col in friday.columns]
    friday = friday.sort_values(sort_cols).reset_index(drop=True)

    if friday.empty:
        raise ValueError(
            f'No rows found for {FRIDAY_DATE} in {data_path.name}. '
            'This usually means your dataset_generator/merge still do not include Friday.'
        )
    return friday
def build_feature_matrix(df: pd.DataFrame):
    """Split df into features X (leak/identifier columns removed) and target y.

    Returns:
        (X, y) where X is a copy of df minus DROP_COLS_ALWAYS/OPTIONAL_DROP_COLS
        and y is the integer TARGET column.
    """
    to_drop = [col for col in DROP_COLS_ALWAYS + OPTIONAL_DROP_COLS if col in df.columns]
    X = df.drop(columns=to_drop, errors='ignore').copy()
    y = df[TARGET].astype(int).copy()
    # Normalise missing/blank sectors so the categorical encoder sees one token.
    if 'sector' in X.columns:
        X['sector'] = X['sector'].fillna('UNKNOWN').replace('', 'UNKNOWN')
    return X, y
def estimate_round_trip_charges(buy_price, exit_price, qty):
    """Estimate total round-trip transaction charges for one trade, in rupees.

    Components: flat brokerage, STT on the sell leg, exchange transaction
    charge and SEBI fee on combined turnover, stamp duty on the buy leg, and
    18% GST applied to brokerage + exchange + SEBI components.

    Returns:
        Total estimated charges rounded to 2 decimal places.
    """
    buy_value = float(buy_price) * float(qty)
    sell_value = float(exit_price) * float(qty)
    combined_value = buy_value + sell_value

    brokerage = 40.0                         # flat charge per round trip
    stt = 0.001 * sell_value                 # sell side only
    exchange_fee = 0.0003503 * combined_value
    sebi_fee = 0.000001 * combined_value
    stamp_duty = 0.00003 * buy_value         # buy side only
    gst = 0.18 * (brokerage + exchange_fee + sebi_fee)

    total = brokerage + stt + exchange_fee + sebi_fee + stamp_duty + gst
    return round(total, 2)
def compute_trade_pnl_1to1(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with 1:1 exit price, gross/net PnL and charge columns.

    Winners (label_1to1 == 1) are assumed to exit at bt_target_1; losers at
    bt_stop_loss. Net PnL subtracts estimated round-trip charges per trade.
    """
    work = df.copy()

    # Coerce the pricing columns to numeric; bad values become NaN.
    for col in ('bt_buy_price', 'bt_stop_loss', 'bt_target_1'):
        work[col] = pd.to_numeric(work[col], errors='coerce')
    work['bt_qty_per_lot'] = pd.to_numeric(work['bt_qty_per_lot'], errors='coerce').fillna(0)
    work['label_1to1'] = pd.to_numeric(work['label_1to1'], errors='coerce').fillna(0).astype(int)

    won = work['label_1to1'].eq(1)
    work['exit_price_1to1'] = np.where(won, work['bt_target_1'], work['bt_stop_loss'])
    work['gross_pnl_1to1'] = np.where(
        won,
        (work['bt_target_1'] - work['bt_buy_price']) * work['bt_qty_per_lot'],
        -(work['bt_buy_price'] - work['bt_stop_loss']) * work['bt_qty_per_lot'],
    )
    work['est_charges_1to1'] = [
        estimate_round_trip_charges(buy, exit_price, qty)
        for buy, exit_price, qty in zip(
            work['bt_buy_price'], work['exit_price_1to1'], work['bt_qty_per_lot']
        )
    ]
    work['net_pnl_1to1'] = work['gross_pnl_1to1'] - work['est_charges_1to1']
    return work
def evaluate_model(model_name: str, preprocessor_path: Path, model_path: Path, df: pd.DataFrame, threshold: float):
    """Score df with one model and summarise trades kept at the given threshold.

    Args:
        model_name: label used in the summary and prediction column names.
        preprocessor_path: joblib file with the fitted feature preprocessor.
        model_path: Keras model file (loaded with compile=False).
        df: Friday trade rows, including label/backtest columns.
        threshold: keep a trade when its predicted probability >= threshold.

    Returns:
        (summary, pred_df): summary dict with keep rate, hit rate and
        estimated gross/net PnL of kept trades; pred_df with one row per
        input trade carrying the model probability and keep flag.
    """
    # Imported lazily so module import stays cheap when TF is not needed.
    import tensorflow as tf

    # Target vector is unused here (labels are read from df below), so
    # discard it rather than binding a dead local.
    X_raw, _ = build_feature_matrix(df)
    preprocessor = joblib.load(preprocessor_path)
    model = tf.keras.models.load_model(model_path, compile=False)
    X = preprocessor.transform(X_raw)
    if hasattr(X, 'toarray'):  # densify sparse preprocessor output for Keras
        X = X.toarray()
    y_prob = model.predict(X, verbose=0).ravel()

    kept_mask = y_prob >= threshold
    kept_df = df.loc[kept_mask].copy()
    kept_df = compute_trade_pnl_1to1(kept_df) if not kept_df.empty else kept_df

    total_count = int(len(df))
    kept_count = int(kept_mask.sum())
    keep_rate = float(kept_count / total_count) if total_count else 0.0
    summary = {
        'date': FRIDAY_DATE,
        'model_name': model_name,
        'threshold': threshold,
        'rows_total': total_count,
        'rows_kept': kept_count,
        'keep_rate': keep_rate,
        'kept_hit_rate_1to1': float(kept_df['label_1to1'].mean()) if kept_count > 0 else None,
        'gross_pnl_1to1': float(kept_df['gross_pnl_1to1'].sum()) if kept_count > 0 else 0.0,
        'est_charges_1to1': float(kept_df['est_charges_1to1'].sum()) if kept_count > 0 else 0.0,
        'net_pnl_1to1': float(kept_df['net_pnl_1to1'].sum()) if kept_count > 0 else 0.0,
        'avg_score_kept': float(np.mean(y_prob[kept_mask])) if kept_count > 0 else None,
    }

    # Fix: nesting same-style quotes inside an f-string is a SyntaxError on
    # Python < 3.12 (PEP 701); build the threshold tag separately instead.
    thr_tag = str(threshold).replace('.', '_')
    pred_df = pd.DataFrame({
        'trade_key': df['trade_key'] if 'trade_key' in df.columns else np.arange(len(df)),
        'trade_date': df['trade_date'],
        'symbol': df['symbol'] if 'symbol' in df.columns else None,
        'direction': df['direction'] if 'direction' in df.columns else None,
        'label_1to1': df['label_1to1'] if 'label_1to1' in df.columns else None,
        f'{model_name}_prob': y_prob,
        f'{model_name}_kept_thr_{thr_tag}': kept_mask.astype(int),
    })
    return summary, pred_df
def baseline_no_gate(df: pd.DataFrame) -> dict:
    """Summary row for trading every signal (no ML gate), for comparison.

    Mirrors the evaluate_model summary schema so all rows concatenate cleanly.
    """
    scored = compute_trade_pnl_1to1(df)
    n_rows = len(scored)
    has_rows = n_rows > 0
    return {
        'date': FRIDAY_DATE,
        'model_name': 'no_gate',
        'threshold': None,
        'rows_total': int(n_rows),
        'rows_kept': int(n_rows),
        'keep_rate': 1.0,
        'kept_hit_rate_1to1': float(scored['label_1to1'].mean()) if has_rows else None,
        'gross_pnl_1to1': float(scored['gross_pnl_1to1'].sum()) if has_rows else 0.0,
        'est_charges_1to1': float(scored['est_charges_1to1'].sum()) if has_rows else 0.0,
        'net_pnl_1to1': float(scored['net_pnl_1to1'].sum()) if has_rows else 0.0,
        'avg_score_kept': None,
    }
def main():
    """Compare gating models on the Friday session and write CSV reports.

    For each threshold, evaluates every configured model, saves per-trade
    predictions, then writes and prints a ranked summary across all runs.
    """
    df = load_friday_data()
    # Hoisted: the original resolved the data path a second time just to
    # print it; resolve once and reuse.
    data_path = resolve_data_path()
    print(f'Friday rows found: {len(df)} for {FRIDAY_DATE}')
    print(f'Dataset date range includes Friday from: {data_path}')

    summaries = [baseline_no_gate(df)]
    for threshold in THRESHOLDS:
        pred_frames = []
        print('\n' + '=' * 80)
        print(f'Friday-only comparison at threshold = {threshold}')
        print('=' * 80)
        for cfg in MODELS:
            print(f"Evaluating {cfg['name']}...")
            summary, preds = evaluate_model(
                model_name=cfg['name'],
                preprocessor_path=cfg['preprocessor'],
                model_path=cfg['model'],
                df=df,
                threshold=threshold,
            )
            summaries.append(summary)
            pred_frames.append(preds)
            print(json.dumps(summary, indent=2))

        # Merge per-model prediction frames on trade_key; the shared
        # descriptor columns are kept from the first frame only.
        shared_cols = {'trade_date', 'symbol', 'direction', 'label_1to1'}
        merged_preds = pred_frames[0]
        for frame in pred_frames[1:]:
            keep_cols = [c for c in frame.columns if c not in shared_cols]
            merged_preds = merged_preds.merge(frame[keep_cols], on='trade_key', how='left')

        # Fix: nesting same-style quotes inside an f-string is a SyntaxError
        # on Python < 3.12 (PEP 701); build the tag separately instead.
        thr_tag = str(threshold).replace('.', '_')
        preds_out = OUT_DIR / f'friday_only_model_predictions_thr_{thr_tag}.csv'
        merged_preds.to_csv(preds_out, index=False)
        print(f'Saved per-trade predictions to: {preds_out}')

    summary_df = pd.DataFrame(summaries)
    summary_out = OUT_DIR / 'friday_only_model_summary.csv'
    summary_df.to_csv(summary_out, index=False)
    print('\nSaved Friday-only summary to:', summary_out)

    print('\nRanking by estimated net pnl (best first):')
    show_cols = [
        'model_name', 'threshold', 'rows_kept', 'keep_rate', 'kept_hit_rate_1to1',
        'gross_pnl_1to1', 'est_charges_1to1', 'net_pnl_1to1', 'avg_score_kept'
    ]
    ranked = summary_df[show_cols].sort_values(
        ['net_pnl_1to1', 'kept_hit_rate_1to1'], ascending=[False, False]
    )
    print(ranked.to_string(index=False))


if __name__ == '__main__':
    main()