from __future__ import annotations from dataclasses import dataclass from datetime import datetime, timedelta, timezone import pandas as pd from .analytics import AnalyticsEngine @dataclass(frozen=True) class BacktestScenario: name: str rows: list[dict] next_day_return_pct: float ticker: str = "TEST" @dataclass(frozen=True) class BacktestSuite: tuning: list[BacktestScenario] holdout: list[BacktestScenario] def _row( title: str, timestamp: datetime, ensemble_pol: float, finbert_pol: float, roberta_pol: float, finbert_score: float, roberta_score: float, conviction: float, significance: float, ) -> dict: return { "title": title, "timestamp": timestamp.isoformat(), "ensemble_pol": ensemble_pol, "finbert_pol": finbert_pol, "roberta_pol": roberta_pol, "finbert_score": finbert_score, "roberta_score": roberta_score, "agreement": 1.0, "conviction": conviction, "significance": significance, } def build_benchmark_suite(reference_time: datetime | None = None) -> BacktestSuite: now = reference_time or datetime.now(timezone.utc) tuning = [ BacktestScenario( name="tuning_fresh_bullish_consensus", ticker="TSLA", next_day_return_pct=1.2, rows=[ _row("TSLA beats estimates and raises guidance after record deliveries", now, 0.86, 0.91, 0.79, 0.97, 0.90, 0.82, 0.95), _row("Analyst upgrades TSLA and raises price target", now - timedelta(hours=3), 0.72, 0.80, 0.63, 0.92, 0.84, 0.71, 0.88), _row("TSLA wins major battery contract in growth push", now - timedelta(hours=8), 0.61, 0.68, 0.53, 0.88, 0.80, 0.61, 0.82), ], ), BacktestScenario( name="tuning_fresh_bearish_consensus", ticker="TSLA", next_day_return_pct=-1.4, rows=[ _row("TSLA cuts guidance as revenue falls below estimates", now, -0.88, -0.93, -0.81, 0.97, 0.90, 0.86, 0.96), _row("SEC investigation and lawsuit deepen pressure on TSLA stock", now - timedelta(hours=2), -0.78, -0.85, -0.68, 0.94, 0.86, 0.76, 0.91), _row("Analyst downgrade sends TSLA lower on demand fears", now - timedelta(hours=5), -0.67, -0.72, -0.59, 0.89, 0.82, 0.64, 0.84), ], ), BacktestScenario( name="tuning_conflicted_flow", ticker="TSLA", next_day_return_pct=0.1, rows=[ _row("TSLA beats estimates but warns on margin headwinds", now, 0.18, 0.24, 0.10, 0.81, 0.76, 0.20, 0.72), _row("Analyst downgrade offsets recent TSLA rally", now - timedelta(hours=4), -0.22, -0.28, -0.14, 0.82, 0.74, 0.22, 0.75), _row("Investors await TSLA delivery update as outlook remains uncertain", now - timedelta(hours=9), 0.02, 0.04, 0.00, 0.70, 0.66, 0.04, 0.63), ], ), BacktestScenario( name="tuning_stale_positive_signal", ticker="TSLA", next_day_return_pct=0.1, rows=[ _row("TSLA beats estimates and raises guidance", now - timedelta(days=5), 0.82, 0.90, 0.74, 0.95, 0.87, 0.79, 0.92), _row("Investors await TSLA update as outlook remains uncertain", now, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60), ], ), ] holdout = [ BacktestScenario( name="holdout_broad_bullish_repricing", ticker="NVDA", next_day_return_pct=1.1, rows=[ _row("NVIDIA tops estimates and raises outlook on AI demand", now, 0.84, 0.89, 0.77, 0.96, 0.89, 0.80, 0.94), _row("Brokerage upgrade lifts NVIDIA price target after strong guidance", now - timedelta(hours=2), 0.69, 0.74, 0.61, 0.91, 0.84, 0.66, 0.87), _row("NVIDIA secures major cloud partnership expansion", now - timedelta(hours=6), 0.56, 0.61, 0.48, 0.87, 0.80, 0.54, 0.80), ], ), BacktestScenario( name="holdout_major_singleton_earnings", ticker="AAPL", next_day_return_pct=0.8, rows=[ _row("Apple beats estimates and raises guidance for next quarter", now, 0.80, 0.86, 0.72, 0.95, 0.87, 0.77, 0.93), ], ), BacktestScenario( name="holdout_broad_bearish_repricing", ticker="NFLX", next_day_return_pct=-1.1, rows=[ _row("Netflix misses estimates and cuts outlook as subscriber growth slows", now, -0.84, -0.89, -0.77, 0.96, 0.90, 0.81, 0.95), _row("Analyst downgrade hits Netflix after weak guidance", now - timedelta(hours=3), -0.66, -0.72, -0.58, 0.90, 0.83, 0.63, 0.86), _row("Probe and lawsuit add pressure to Netflix shares", now - timedelta(hours=7), -0.58, -0.64, -0.49, 0.88, 0.81, 0.55, 0.82), ], ), BacktestScenario( name="holdout_mixed_crosscurrents", ticker="AMZN", next_day_return_pct=0.05, rows=[ _row("Amazon wins cloud contract but warns on margin pressure", now, 0.17, 0.22, 0.10, 0.82, 0.76, 0.19, 0.74), _row("Analyst downgrade trims Amazon target after recent rally", now - timedelta(hours=4), -0.19, -0.25, -0.11, 0.81, 0.73, 0.20, 0.73), _row("Investors stay cautious ahead of Amazon operating update", now - timedelta(hours=9), 0.00, 0.02, 0.00, 0.70, 0.66, 0.01, 0.61), ], ), BacktestScenario( name="holdout_thin_generic_positive", ticker="META", next_day_return_pct=0.18, rows=[ _row("Meta launches new consumer feature across more markets", now, 0.33, 0.37, 0.27, 0.82, 0.78, 0.29, 0.67), ], ), BacktestScenario( name="holdout_thin_generic_negative", ticker="DIS", next_day_return_pct=-0.14, rows=[ _row("Disney faces production delay at key studio release", now, -0.31, -0.35, -0.26, 0.84, 0.77, 0.27, 0.66), ], ), BacktestScenario( name="holdout_stale_positive_without_followthrough", ticker="CRM", next_day_return_pct=0.12, rows=[ _row("Salesforce announces partnership expansion and upbeat commentary", now - timedelta(days=4), 0.61, 0.67, 0.52, 0.89, 0.81, 0.58, 0.83), _row("Traders await Salesforce update as visibility remains mixed", now, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60), ], ), BacktestScenario( name="holdout_mild_positive_lean", ticker="ORCL", next_day_return_pct=0.42, rows=[ _row("Oracle partnership expands enterprise demand pipeline", now, 0.39, 0.44, 0.31, 0.85, 0.79, 0.35, 0.73), _row("Analyst note turns constructive on Oracle cloud growth", now - timedelta(hours=5), 0.28, 0.33, 0.20, 0.81, 0.74, 0.25, 0.70), ], ), BacktestScenario( name="holdout_mild_negative_lean", ticker="INTC", next_day_return_pct=-0.46, rows=[ _row("Intel downgrade reflects weaker PC demand expectations", now, -0.41, -0.46, -0.34, 0.86, 0.80, 0.37, 0.75), _row("Intel delay raises execution concerns for next launch", now - timedelta(hours=4), -0.29, -0.34, -0.22, 0.82, 0.76, 0.26, 0.71), ], ), ] return BacktestSuite(tuning=tuning, holdout=holdout) def expected_direction(next_day_return_pct: float, neutral_band_pct: float = 0.35) -> str: if next_day_return_pct >= neutral_band_pct: return "UP" if next_day_return_pct <= -neutral_band_pct: return "DOWN" return "MIXED" def _target_score(next_day_return_pct: float) -> int: normalized = max(-1.0, min(1.0, next_day_return_pct / 2.0)) return int(round((normalized + 1.0) * 50.0)) def run_backtest( scenarios: list[BacktestScenario], engine: AnalyticsEngine | None = None, neutral_band_pct: float = 0.35, ) -> dict: sentiment_engine = engine or AnalyticsEngine() results = [] for scenario in scenarios: summary = sentiment_engine.get_summary(pd.DataFrame(scenario.rows)) actual_call = expected_direction(scenario.next_day_return_pct, neutral_band_pct=neutral_band_pct) predicted_call = summary["direction_call"] results.append( { "scenario": scenario.name, "ticker": scenario.ticker, "next_day_return_pct": float(scenario.next_day_return_pct), "actual_call": actual_call, "predicted_call": predicted_call, "direction_score": int(summary["direction_score"]), "direction_confidence": int(summary["direction_confidence"]), "score_error": abs(int(summary["direction_score"]) - _target_score(scenario.next_day_return_pct)), "is_correct": predicted_call == actual_call, "overcalled": actual_call == "MIXED" and predicted_call in {"UP", "DOWN"}, "undercalled": actual_call in {"UP", "DOWN"} and predicted_call == "MIXED", } ) result_frame = pd.DataFrame(results) decisive_mask = result_frame["predicted_call"].isin(["UP", "DOWN"]) actual_decisive_mask = result_frame["actual_call"].isin(["UP", "DOWN"]) mixed_actual_mask = result_frame["actual_call"].eq("MIXED") decisive_precision = float(result_frame.loc[decisive_mask, "is_correct"].mean()) if decisive_mask.any() else 0.0 decisive_recall = float( result_frame.loc[actual_decisive_mask, "predicted_call"].eq(result_frame.loc[actual_decisive_mask, "actual_call"]).mean() ) if actual_decisive_mask.any() else 0.0 mixed_accuracy = float( result_frame.loc[mixed_actual_mask, "predicted_call"].eq("MIXED").mean() ) if mixed_actual_mask.any() else 0.0 return { "results": result_frame.to_dict(orient="records"), "metrics": { "scenario_count": int(len(result_frame)), "overall_accuracy": float(result_frame["is_correct"].mean()) if not result_frame.empty else 0.0, "decisive_precision": decisive_precision, "decisive_recall": decisive_recall, "mixed_accuracy": mixed_accuracy, "coverage": float(decisive_mask.mean()) if not result_frame.empty else 0.0, "overcall_rate": float(result_frame["overcalled"].mean()) if not result_frame.empty else 0.0, "undercall_rate": float(result_frame["undercalled"].mean()) if not result_frame.empty else 0.0, "mean_score_error": float(result_frame["score_error"].mean()) if not result_frame.empty else 0.0, }, }