Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from datetime import datetime, timedelta, timezone | |
| import pandas as pd | |
| from .analytics import AnalyticsEngine | |
@dataclass
class BacktestScenario:
    """One labelled backtest case: a set of headline rows plus the realised return.

    The class is built with keyword arguments elsewhere in this module, so it
    must be a dataclass to get a generated ``__init__``.
    """

    # Identifier reported as "scenario" in backtest results.
    name: str
    # Per-headline records; converted to a DataFrame before being scored.
    rows: list[dict]
    # Realised next-day return, in percent, used as the ground-truth label.
    next_day_return_pct: float
    # Ticker symbol the scenario refers to; placeholder default for ad-hoc cases.
    ticker: str = "TEST"
@dataclass
class BacktestSuite:
    """Container pairing the tuning scenarios with the holdout scenarios.

    Built with keyword arguments (``tuning=…, holdout=…``), which requires the
    dataclass-generated ``__init__``.
    """

    # Scenarios in the tuning split.
    tuning: list[BacktestScenario]
    # Scenarios in the holdout split.
    holdout: list[BacktestScenario]
def _row(
    title: str,
    timestamp: datetime,
    ensemble_pol: float,
    finbert_pol: float,
    roberta_pol: float,
    finbert_score: float,
    roberta_score: float,
    conviction: float,
    significance: float,
) -> dict:
    """Pack one headline's fields into a plain dict record.

    The timestamp is stored as its ISO-8601 string, and ``agreement`` is
    always set to ``1.0``. Every other value is passed through unchanged.
    """
    iso_stamp = timestamp.isoformat()
    # Keyword order here fixes the key order of the resulting record.
    return dict(
        title=title,
        timestamp=iso_stamp,
        ensemble_pol=ensemble_pol,
        finbert_pol=finbert_pol,
        roberta_pol=roberta_pol,
        finbert_score=finbert_score,
        roberta_score=roberta_score,
        agreement=1.0,
        conviction=conviction,
        significance=significance,
    )
def build_benchmark_suite(reference_time: datetime | None = None) -> BacktestSuite:
    """Assemble the fixed tuning and holdout scenarios around a single anchor time.

    Args:
        reference_time: Anchor for every headline timestamp. When omitted, the
            current UTC time is used.

    Returns:
        A ``BacktestSuite`` holding four tuning scenarios and nine holdout
        scenarios. Headline timestamps are either the anchor itself or the
        anchor shifted back by a fixed number of hours or days.

    The numeric arguments of each ``_row`` call follow that helper's positional
    order.
    """
    anchor = reference_time or datetime.now(timezone.utc)

    def hours_ago(hours: int) -> datetime:
        return anchor - timedelta(hours=hours)

    def days_ago(days: int) -> datetime:
        return anchor - timedelta(days=days)

    def scenario(name: str, ticker: str, next_day_return_pct: float, *rows: dict) -> BacktestScenario:
        return BacktestScenario(
            name=name,
            ticker=ticker,
            next_day_return_pct=next_day_return_pct,
            rows=list(rows),
        )

    tuning_scenarios = [
        scenario(
            "tuning_fresh_bullish_consensus", "TSLA", 1.2,
            _row(
                "TSLA beats estimates and raises guidance after record deliveries",
                anchor, 0.86, 0.91, 0.79, 0.97, 0.90, 0.82, 0.95,
            ),
            _row(
                "Analyst upgrades TSLA and raises price target",
                hours_ago(3), 0.72, 0.80, 0.63, 0.92, 0.84, 0.71, 0.88,
            ),
            _row(
                "TSLA wins major battery contract in growth push",
                hours_ago(8), 0.61, 0.68, 0.53, 0.88, 0.80, 0.61, 0.82,
            ),
        ),
        scenario(
            "tuning_fresh_bearish_consensus", "TSLA", -1.4,
            _row(
                "TSLA cuts guidance as revenue falls below estimates",
                anchor, -0.88, -0.93, -0.81, 0.97, 0.90, 0.86, 0.96,
            ),
            _row(
                "SEC investigation and lawsuit deepen pressure on TSLA stock",
                hours_ago(2), -0.78, -0.85, -0.68, 0.94, 0.86, 0.76, 0.91,
            ),
            _row(
                "Analyst downgrade sends TSLA lower on demand fears",
                hours_ago(5), -0.67, -0.72, -0.59, 0.89, 0.82, 0.64, 0.84,
            ),
        ),
        scenario(
            "tuning_conflicted_flow", "TSLA", 0.1,
            _row(
                "TSLA beats estimates but warns on margin headwinds",
                anchor, 0.18, 0.24, 0.10, 0.81, 0.76, 0.20, 0.72,
            ),
            _row(
                "Analyst downgrade offsets recent TSLA rally",
                hours_ago(4), -0.22, -0.28, -0.14, 0.82, 0.74, 0.22, 0.75,
            ),
            _row(
                "Investors await TSLA delivery update as outlook remains uncertain",
                hours_ago(9), 0.02, 0.04, 0.00, 0.70, 0.66, 0.04, 0.63,
            ),
        ),
        scenario(
            "tuning_stale_positive_signal", "TSLA", 0.1,
            _row(
                "TSLA beats estimates and raises guidance",
                days_ago(5), 0.82, 0.90, 0.74, 0.95, 0.87, 0.79, 0.92,
            ),
            _row(
                "Investors await TSLA update as outlook remains uncertain",
                anchor, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60,
            ),
        ),
    ]

    holdout_scenarios = [
        scenario(
            "holdout_broad_bullish_repricing", "NVDA", 1.1,
            _row(
                "NVIDIA tops estimates and raises outlook on AI demand",
                anchor, 0.84, 0.89, 0.77, 0.96, 0.89, 0.80, 0.94,
            ),
            _row(
                "Brokerage upgrade lifts NVIDIA price target after strong guidance",
                hours_ago(2), 0.69, 0.74, 0.61, 0.91, 0.84, 0.66, 0.87,
            ),
            _row(
                "NVIDIA secures major cloud partnership expansion",
                hours_ago(6), 0.56, 0.61, 0.48, 0.87, 0.80, 0.54, 0.80,
            ),
        ),
        scenario(
            "holdout_major_singleton_earnings", "AAPL", 0.8,
            _row(
                "Apple beats estimates and raises guidance for next quarter",
                anchor, 0.80, 0.86, 0.72, 0.95, 0.87, 0.77, 0.93,
            ),
        ),
        scenario(
            "holdout_broad_bearish_repricing", "NFLX", -1.1,
            _row(
                "Netflix misses estimates and cuts outlook as subscriber growth slows",
                anchor, -0.84, -0.89, -0.77, 0.96, 0.90, 0.81, 0.95,
            ),
            _row(
                "Analyst downgrade hits Netflix after weak guidance",
                hours_ago(3), -0.66, -0.72, -0.58, 0.90, 0.83, 0.63, 0.86,
            ),
            _row(
                "Probe and lawsuit add pressure to Netflix shares",
                hours_ago(7), -0.58, -0.64, -0.49, 0.88, 0.81, 0.55, 0.82,
            ),
        ),
        scenario(
            "holdout_mixed_crosscurrents", "AMZN", 0.05,
            _row(
                "Amazon wins cloud contract but warns on margin pressure",
                anchor, 0.17, 0.22, 0.10, 0.82, 0.76, 0.19, 0.74,
            ),
            _row(
                "Analyst downgrade trims Amazon target after recent rally",
                hours_ago(4), -0.19, -0.25, -0.11, 0.81, 0.73, 0.20, 0.73,
            ),
            _row(
                "Investors stay cautious ahead of Amazon operating update",
                hours_ago(9), 0.00, 0.02, 0.00, 0.70, 0.66, 0.01, 0.61,
            ),
        ),
        scenario(
            "holdout_thin_generic_positive", "META", 0.18,
            _row(
                "Meta launches new consumer feature across more markets",
                anchor, 0.33, 0.37, 0.27, 0.82, 0.78, 0.29, 0.67,
            ),
        ),
        scenario(
            "holdout_thin_generic_negative", "DIS", -0.14,
            _row(
                "Disney faces production delay at key studio release",
                anchor, -0.31, -0.35, -0.26, 0.84, 0.77, 0.27, 0.66,
            ),
        ),
        scenario(
            "holdout_stale_positive_without_followthrough", "CRM", 0.12,
            _row(
                "Salesforce announces partnership expansion and upbeat commentary",
                days_ago(4), 0.61, 0.67, 0.52, 0.89, 0.81, 0.58, 0.83,
            ),
            _row(
                "Traders await Salesforce update as visibility remains mixed",
                anchor, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60,
            ),
        ),
        scenario(
            "holdout_mild_positive_lean", "ORCL", 0.42,
            _row(
                "Oracle partnership expands enterprise demand pipeline",
                anchor, 0.39, 0.44, 0.31, 0.85, 0.79, 0.35, 0.73,
            ),
            _row(
                "Analyst note turns constructive on Oracle cloud growth",
                hours_ago(5), 0.28, 0.33, 0.20, 0.81, 0.74, 0.25, 0.70,
            ),
        ),
        scenario(
            "holdout_mild_negative_lean", "INTC", -0.46,
            _row(
                "Intel downgrade reflects weaker PC demand expectations",
                anchor, -0.41, -0.46, -0.34, 0.86, 0.80, 0.37, 0.75,
            ),
            _row(
                "Intel delay raises execution concerns for next launch",
                hours_ago(4), -0.29, -0.34, -0.22, 0.82, 0.76, 0.26, 0.71,
            ),
        ),
    ]

    return BacktestSuite(tuning=tuning_scenarios, holdout=holdout_scenarios)
def expected_direction(next_day_return_pct: float, neutral_band_pct: float = 0.35) -> str:
    """Label a realised return as ``"UP"``, ``"DOWN"`` or ``"MIXED"``.

    Args:
        next_day_return_pct: Realised return, in percent.
        neutral_band_pct: Half-width of the neutral zone. Returns at or above
            ``+band`` are ``"UP"``, at or below ``-band`` are ``"DOWN"``.

    Returns:
        The direction label; anything strictly inside the band is ``"MIXED"``.
    """
    # The upside check runs first, so it wins if both bounds are satisfied.
    if next_day_return_pct >= neutral_band_pct:
        label = "UP"
    elif next_day_return_pct <= -neutral_band_pct:
        label = "DOWN"
    else:
        label = "MIXED"
    return label
def _target_score(next_day_return_pct: float) -> int:
    """Map a realised return onto a 0-100 integer target score.

    The return is halved and clamped to ``[-1, 1]``, so moves of 2% or more in
    either direction saturate. The clamped value is then shifted and scaled so
    that ``-1`` becomes 0, ``0`` becomes 50 and ``+1`` becomes 100.
    """
    halved = next_day_return_pct / 2.0
    capped_above = min(1.0, halved)
    clamped = max(-1.0, capped_above)
    scaled = (clamped + 1.0) * 50.0
    return int(round(scaled))
def run_backtest(
    scenarios: list[BacktestScenario],
    engine: AnalyticsEngine | None = None,
    neutral_band_pct: float = 0.35,
) -> dict:
    """Score each scenario with the engine and aggregate direction-call metrics.

    Args:
        scenarios: Scenarios to evaluate. An empty list is allowed and yields
            an empty result list with every metric set to zero.
        engine: Object exposing ``get_summary(DataFrame)``. The summary must
            provide ``"direction_call"``, ``"direction_score"`` and
            ``"direction_confidence"``. A fresh ``AnalyticsEngine`` is created
            when omitted.
        neutral_band_pct: Band passed to ``expected_direction`` when labelling
            the realised return of each scenario.

    Returns:
        A dict with two keys:

        * ``"results"``: one record per scenario (names, calls, scores, flags).
        * ``"metrics"``: ``scenario_count``, ``overall_accuracy``,
          ``decisive_precision`` (correct share of UP/DOWN predictions),
          ``decisive_recall`` (correct share of scenarios whose actual call is
          UP/DOWN), ``mixed_accuracy`` (correct share of scenarios whose actual
          call is MIXED), ``coverage`` (share of UP/DOWN predictions),
          ``overcall_rate``, ``undercall_rate`` and ``mean_score_error``.
          Any ratio with an empty denominator is reported as ``0.0``.
    """
    # Only fall back to the default when no engine was supplied at all; a
    # caller-provided engine is used even if it happens to be falsy.
    sentiment_engine = engine if engine is not None else AnalyticsEngine()

    columns = [
        "scenario",
        "ticker",
        "next_day_return_pct",
        "actual_call",
        "predicted_call",
        "direction_score",
        "direction_confidence",
        "score_error",
        "is_correct",
        "overcalled",
        "undercalled",
    ]
    decisive_calls = ["UP", "DOWN"]

    results = []
    for scenario in scenarios:
        summary = sentiment_engine.get_summary(pd.DataFrame(scenario.rows))
        actual_call = expected_direction(scenario.next_day_return_pct, neutral_band_pct=neutral_band_pct)
        predicted_call = summary["direction_call"]
        direction_score = int(summary["direction_score"])
        results.append(
            {
                "scenario": scenario.name,
                "ticker": scenario.ticker,
                "next_day_return_pct": float(scenario.next_day_return_pct),
                "actual_call": actual_call,
                "predicted_call": predicted_call,
                "direction_score": direction_score,
                "direction_confidence": int(summary["direction_confidence"]),
                "score_error": abs(direction_score - _target_score(scenario.next_day_return_pct)),
                "is_correct": predicted_call == actual_call,
                "overcalled": actual_call == "MIXED" and predicted_call in decisive_calls,
                "undercalled": actual_call in decisive_calls and predicted_call == "MIXED",
            }
        )

    # Explicit columns keep the frame well-formed for an empty scenario list;
    # without them the column lookups below raise KeyError.
    result_frame = pd.DataFrame(results, columns=columns)

    def _mean_or_zero(values: pd.Series) -> float:
        """Return the mean of *values*, or 0.0 when there is nothing to average."""
        return float(values.mean()) if len(values) else 0.0

    decisive_mask = result_frame["predicted_call"].isin(decisive_calls)
    actual_decisive_mask = result_frame["actual_call"].isin(decisive_calls)
    mixed_actual_mask = result_frame["actual_call"].eq("MIXED")
    is_correct = result_frame["is_correct"]

    return {
        "results": result_frame.to_dict(orient="records"),
        "metrics": {
            "scenario_count": int(len(result_frame)),
            "overall_accuracy": _mean_or_zero(is_correct),
            "decisive_precision": _mean_or_zero(is_correct[decisive_mask]),
            # Within a subset selected by actual call, "predicted == actual"
            # is exactly the is_correct flag.
            "decisive_recall": _mean_or_zero(is_correct[actual_decisive_mask]),
            "mixed_accuracy": _mean_or_zero(is_correct[mixed_actual_mask]),
            "coverage": _mean_or_zero(decisive_mask),
            "overcall_rate": _mean_or_zero(result_frame["overcalled"]),
            "undercall_rate": _mean_or_zero(result_frame["undercalled"]),
            "mean_score_error": _mean_or_zero(result_frame["score_error"]),
        },
    }