finbert_anaylzer / engine /backtest.py
Jitendra12421's picture
Upload 12 files
16ae9d0 verified
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import pandas as pd
from .analytics import AnalyticsEngine
@dataclass(frozen=True)
class BacktestScenario:
    """One synthetic backtest case: scored headlines plus the realised outcome.

    Instances are immutable (``frozen=True``); note that ``rows`` is still a
    regular list, so the list contents themselves are not protected.
    """

    # Human-readable identifier, reported as "scenario" in backtest results.
    name: str
    # Per-headline records; each dict becomes one row of the DataFrame that is
    # handed to the analytics engine.
    rows: list[dict]
    # Realised next-day return, in percent (as the field name indicates).
    next_day_return_pct: float
    # Symbol the scenario refers to; placeholder default for ad-hoc cases.
    ticker: str = "TEST"
@dataclass(frozen=True)
class BacktestSuite:
    """Immutable pair of scenario collections: a tuning set and a holdout set."""

    # Scenarios intended for calibrating the engine.
    tuning: list[BacktestScenario]
    # Scenarios kept apart from tuning for evaluation.
    holdout: list[BacktestScenario]
def _row(
title: str,
timestamp: datetime,
ensemble_pol: float,
finbert_pol: float,
roberta_pol: float,
finbert_score: float,
roberta_score: float,
conviction: float,
significance: float,
) -> dict:
return {
"title": title,
"timestamp": timestamp.isoformat(),
"ensemble_pol": ensemble_pol,
"finbert_pol": finbert_pol,
"roberta_pol": roberta_pol,
"finbert_score": finbert_score,
"roberta_score": roberta_score,
"agreement": 1.0,
"conviction": conviction,
"significance": significance,
}
def build_benchmark_suite(reference_time: datetime | None = None) -> BacktestSuite:
    """Assemble the built-in benchmark suite of synthetic headline scenarios.

    Args:
        reference_time: Anchor for all headline timestamps. When omitted, the
            current UTC time is used.

    Returns:
        A ``BacktestSuite`` holding four tuning scenarios and nine holdout
        scenarios. Headline timestamps are either the anchor itself or the
        anchor minus a fixed number of hours/days.
    """
    now = reference_time if reference_time else datetime.now(timezone.utc)

    def ago(**offset) -> datetime:
        # Timestamp lying `offset` (timedelta keyword arguments) before the anchor.
        return now - timedelta(**offset)

    tuning_scenarios = [
        BacktestScenario(
            name="tuning_fresh_bullish_consensus",
            ticker="TSLA",
            next_day_return_pct=1.2,
            rows=[
                _row("TSLA beats estimates and raises guidance after record deliveries", now, 0.86, 0.91, 0.79, 0.97, 0.90, 0.82, 0.95),
                _row("Analyst upgrades TSLA and raises price target", ago(hours=3), 0.72, 0.80, 0.63, 0.92, 0.84, 0.71, 0.88),
                _row("TSLA wins major battery contract in growth push", ago(hours=8), 0.61, 0.68, 0.53, 0.88, 0.80, 0.61, 0.82),
            ],
        ),
        BacktestScenario(
            name="tuning_fresh_bearish_consensus",
            ticker="TSLA",
            next_day_return_pct=-1.4,
            rows=[
                _row("TSLA cuts guidance as revenue falls below estimates", now, -0.88, -0.93, -0.81, 0.97, 0.90, 0.86, 0.96),
                _row("SEC investigation and lawsuit deepen pressure on TSLA stock", ago(hours=2), -0.78, -0.85, -0.68, 0.94, 0.86, 0.76, 0.91),
                _row("Analyst downgrade sends TSLA lower on demand fears", ago(hours=5), -0.67, -0.72, -0.59, 0.89, 0.82, 0.64, 0.84),
            ],
        ),
        BacktestScenario(
            name="tuning_conflicted_flow",
            ticker="TSLA",
            next_day_return_pct=0.1,
            rows=[
                _row("TSLA beats estimates but warns on margin headwinds", now, 0.18, 0.24, 0.10, 0.81, 0.76, 0.20, 0.72),
                _row("Analyst downgrade offsets recent TSLA rally", ago(hours=4), -0.22, -0.28, -0.14, 0.82, 0.74, 0.22, 0.75),
                _row("Investors await TSLA delivery update as outlook remains uncertain", ago(hours=9), 0.02, 0.04, 0.00, 0.70, 0.66, 0.04, 0.63),
            ],
        ),
        BacktestScenario(
            name="tuning_stale_positive_signal",
            ticker="TSLA",
            next_day_return_pct=0.1,
            rows=[
                _row("TSLA beats estimates and raises guidance", ago(days=5), 0.82, 0.90, 0.74, 0.95, 0.87, 0.79, 0.92),
                _row("Investors await TSLA update as outlook remains uncertain", now, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60),
            ],
        ),
    ]

    holdout_scenarios = [
        BacktestScenario(
            name="holdout_broad_bullish_repricing",
            ticker="NVDA",
            next_day_return_pct=1.1,
            rows=[
                _row("NVIDIA tops estimates and raises outlook on AI demand", now, 0.84, 0.89, 0.77, 0.96, 0.89, 0.80, 0.94),
                _row("Brokerage upgrade lifts NVIDIA price target after strong guidance", ago(hours=2), 0.69, 0.74, 0.61, 0.91, 0.84, 0.66, 0.87),
                _row("NVIDIA secures major cloud partnership expansion", ago(hours=6), 0.56, 0.61, 0.48, 0.87, 0.80, 0.54, 0.80),
            ],
        ),
        BacktestScenario(
            name="holdout_major_singleton_earnings",
            ticker="AAPL",
            next_day_return_pct=0.8,
            rows=[
                _row("Apple beats estimates and raises guidance for next quarter", now, 0.80, 0.86, 0.72, 0.95, 0.87, 0.77, 0.93),
            ],
        ),
        BacktestScenario(
            name="holdout_broad_bearish_repricing",
            ticker="NFLX",
            next_day_return_pct=-1.1,
            rows=[
                _row("Netflix misses estimates and cuts outlook as subscriber growth slows", now, -0.84, -0.89, -0.77, 0.96, 0.90, 0.81, 0.95),
                _row("Analyst downgrade hits Netflix after weak guidance", ago(hours=3), -0.66, -0.72, -0.58, 0.90, 0.83, 0.63, 0.86),
                _row("Probe and lawsuit add pressure to Netflix shares", ago(hours=7), -0.58, -0.64, -0.49, 0.88, 0.81, 0.55, 0.82),
            ],
        ),
        BacktestScenario(
            name="holdout_mixed_crosscurrents",
            ticker="AMZN",
            next_day_return_pct=0.05,
            rows=[
                _row("Amazon wins cloud contract but warns on margin pressure", now, 0.17, 0.22, 0.10, 0.82, 0.76, 0.19, 0.74),
                _row("Analyst downgrade trims Amazon target after recent rally", ago(hours=4), -0.19, -0.25, -0.11, 0.81, 0.73, 0.20, 0.73),
                _row("Investors stay cautious ahead of Amazon operating update", ago(hours=9), 0.00, 0.02, 0.00, 0.70, 0.66, 0.01, 0.61),
            ],
        ),
        BacktestScenario(
            name="holdout_thin_generic_positive",
            ticker="META",
            next_day_return_pct=0.18,
            rows=[
                _row("Meta launches new consumer feature across more markets", now, 0.33, 0.37, 0.27, 0.82, 0.78, 0.29, 0.67),
            ],
        ),
        BacktestScenario(
            name="holdout_thin_generic_negative",
            ticker="DIS",
            next_day_return_pct=-0.14,
            rows=[
                _row("Disney faces production delay at key studio release", now, -0.31, -0.35, -0.26, 0.84, 0.77, 0.27, 0.66),
            ],
        ),
        BacktestScenario(
            name="holdout_stale_positive_without_followthrough",
            ticker="CRM",
            next_day_return_pct=0.12,
            rows=[
                _row("Salesforce announces partnership expansion and upbeat commentary", ago(days=4), 0.61, 0.67, 0.52, 0.89, 0.81, 0.58, 0.83),
                _row("Traders await Salesforce update as visibility remains mixed", now, 0.01, 0.03, 0.00, 0.71, 0.67, 0.02, 0.60),
            ],
        ),
        BacktestScenario(
            name="holdout_mild_positive_lean",
            ticker="ORCL",
            next_day_return_pct=0.42,
            rows=[
                _row("Oracle partnership expands enterprise demand pipeline", now, 0.39, 0.44, 0.31, 0.85, 0.79, 0.35, 0.73),
                _row("Analyst note turns constructive on Oracle cloud growth", ago(hours=5), 0.28, 0.33, 0.20, 0.81, 0.74, 0.25, 0.70),
            ],
        ),
        BacktestScenario(
            name="holdout_mild_negative_lean",
            ticker="INTC",
            next_day_return_pct=-0.46,
            rows=[
                _row("Intel downgrade reflects weaker PC demand expectations", now, -0.41, -0.46, -0.34, 0.86, 0.80, 0.37, 0.75),
                _row("Intel delay raises execution concerns for next launch", ago(hours=4), -0.29, -0.34, -0.22, 0.82, 0.76, 0.26, 0.71),
            ],
        ),
    ]

    return BacktestSuite(tuning=tuning_scenarios, holdout=holdout_scenarios)
def expected_direction(next_day_return_pct: float, neutral_band_pct: float = 0.35) -> str:
    """Translate a realised next-day return into a direction label.

    Args:
        next_day_return_pct: Realised return to classify.
        neutral_band_pct: Half-width of the neutral zone around zero; the
            band edges themselves count as decisive.

    Returns:
        ``"UP"`` when the return is at or above the band, ``"DOWN"`` when it
        is at or below the negated band, otherwise ``"MIXED"``.
    """
    if next_day_return_pct >= neutral_band_pct:
        label = "UP"
    elif next_day_return_pct <= -neutral_band_pct:
        label = "DOWN"
    else:
        label = "MIXED"
    return label
def _target_score(next_day_return_pct: float) -> int:
normalized = max(-1.0, min(1.0, next_day_return_pct / 2.0))
return int(round((normalized + 1.0) * 50.0))
def run_backtest(
    scenarios: list[BacktestScenario],
    engine: AnalyticsEngine | None = None,
    neutral_band_pct: float = 0.35,
) -> dict:
    """Run every scenario through the analytics engine and score the calls.

    For each scenario the rows are wrapped in a DataFrame and passed to
    ``engine.get_summary``; the returned summary must provide the keys
    ``"direction_call"``, ``"direction_score"`` and ``"direction_confidence"``.
    The predicted call is compared with the label derived from the realised
    return via ``expected_direction``.

    Args:
        scenarios: Scenarios to evaluate. May be empty, in which case an empty
            result list and all-zero metrics are returned.
        engine: Engine to evaluate. A fresh ``AnalyticsEngine`` is created
            only when this is ``None``.
        neutral_band_pct: Neutral band forwarded to ``expected_direction``.

    Returns:
        A dict with two entries:

        * ``"results"``: one record per scenario (scenario name, ticker,
          return, actual/predicted call, score, confidence, score error and
          the correctness / overcall / undercall flags).
        * ``"metrics"``: aggregate figures: ``scenario_count``,
          ``overall_accuracy``, ``decisive_precision`` (accuracy among UP/DOWN
          predictions), ``decisive_recall`` (accuracy among UP/DOWN outcomes),
          ``mixed_accuracy`` (accuracy among MIXED outcomes), ``coverage``
          (share of UP/DOWN predictions), ``overcall_rate``,
          ``undercall_rate`` and ``mean_score_error``. Any metric whose
          denominator is empty is reported as ``0.0``.
    """
    # Explicit None check: a caller-supplied engine must never be swapped out
    # just because it happens to evaluate as falsy.
    sentiment_engine = engine if engine is not None else AnalyticsEngine()

    # Fixed column list so the frame has the expected columns even when there
    # are no scenarios; otherwise the column lookups below raise KeyError.
    result_columns = [
        "scenario",
        "ticker",
        "next_day_return_pct",
        "actual_call",
        "predicted_call",
        "direction_score",
        "direction_confidence",
        "score_error",
        "is_correct",
        "overcalled",
        "undercalled",
    ]

    results = []
    for scenario in scenarios:
        summary = sentiment_engine.get_summary(pd.DataFrame(scenario.rows))
        actual_call = expected_direction(scenario.next_day_return_pct, neutral_band_pct=neutral_band_pct)
        predicted_call = summary["direction_call"]
        direction_score = int(summary["direction_score"])
        results.append(
            {
                "scenario": scenario.name,
                "ticker": scenario.ticker,
                "next_day_return_pct": float(scenario.next_day_return_pct),
                "actual_call": actual_call,
                "predicted_call": predicted_call,
                "direction_score": direction_score,
                "direction_confidence": int(summary["direction_confidence"]),
                "score_error": abs(direction_score - _target_score(scenario.next_day_return_pct)),
                "is_correct": predicted_call == actual_call,
                # Overcall: a decisive prediction when the outcome was MIXED.
                "overcalled": actual_call == "MIXED" and predicted_call in {"UP", "DOWN"},
                # Undercall: a MIXED prediction when the outcome was decisive.
                "undercalled": actual_call in {"UP", "DOWN"} and predicted_call == "MIXED",
            }
        )

    result_frame = pd.DataFrame(results, columns=result_columns)
    has_rows = not result_frame.empty

    decisive_mask = result_frame["predicted_call"].isin(["UP", "DOWN"])
    actual_decisive_mask = result_frame["actual_call"].isin(["UP", "DOWN"])
    mixed_actual_mask = result_frame["actual_call"].eq("MIXED")

    decisive_precision = float(result_frame.loc[decisive_mask, "is_correct"].mean()) if decisive_mask.any() else 0.0
    decisive_recall = float(
        result_frame.loc[actual_decisive_mask, "predicted_call"].eq(result_frame.loc[actual_decisive_mask, "actual_call"]).mean()
    ) if actual_decisive_mask.any() else 0.0
    mixed_accuracy = float(
        result_frame.loc[mixed_actual_mask, "predicted_call"].eq("MIXED").mean()
    ) if mixed_actual_mask.any() else 0.0

    return {
        "results": result_frame.to_dict(orient="records"),
        "metrics": {
            "scenario_count": int(len(result_frame)),
            "overall_accuracy": float(result_frame["is_correct"].mean()) if has_rows else 0.0,
            "decisive_precision": decisive_precision,
            "decisive_recall": decisive_recall,
            "mixed_accuracy": mixed_accuracy,
            "coverage": float(decisive_mask.mean()) if has_rows else 0.0,
            "overcall_rate": float(result_frame["overcalled"].mean()) if has_rows else 0.0,
            "undercall_rate": float(result_frame["undercalled"].mean()) if has_rows else 0.0,
            "mean_score_error": float(result_frame["score_error"].mean()) if has_rows else 0.0,
        },
    }