"""Tests for ``NewsScraper`` and the ``AnalyticsEngine`` summary/ETA helpers.

The summary tests feed ``AnalyticsEngine.get_summary`` small hand-built
frames whose rows already carry the polarity, score, agreement, conviction
and significance columns, then assert on fields of the returned summary.
"""

from datetime import datetime, timedelta, timezone
from pathlib import Path
import sys

import pandas as pd

# Make the project root importable when the tests are run from this folder.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from data.scraper import NewsScraper  # noqa: E402
from engine.analytics import AnalyticsEngine  # noqa: E402


def _headline_frame(rows):
    """Build a DataFrame from a list of per-headline dictionaries."""
    return pd.DataFrame(rows)


def _assert_models_not_loaded(engine):
    """Assert that none of the engine's model attributes has been populated."""
    assert engine.finbert is None
    assert engine.distilroberta is None
    assert engine.ranker is None


def test_scraper_init():
    """The ``limit`` constructor argument is exposed as ``scraper.limit``."""
    scraper = NewsScraper(limit=10)
    assert scraper.limit == 10


def test_scraper_query_diversity():
    """``_build_queries`` yields at least 50 queries for a 600-article limit."""
    scraper = NewsScraper(limit=600)
    queries = scraper._build_queries("TSLA")
    assert len(queries) >= 50


def test_summary_does_not_eager_load_models():
    """Models stay unset both after construction and after ``get_summary``."""
    engine = AnalyticsEngine()
    _assert_models_not_loaded(engine)

    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA beats estimates and raises guidance",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.82,
                "finbert_pol": 0.9,
                "roberta_pol": 0.75,
                "finbert_score": 0.95,
                "roberta_score": 0.88,
                "agreement": 1.0,
                "conviction": 0.8,
                "significance": 0.9,
            }
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] in {"UP", "MIXED"}
    assert summary["direction_score"] >= 50
    assert summary["event_support"] > 0.5
    # Summarising an already-scored frame must not have loaded any model.
    _assert_models_not_loaded(engine)


def test_positive_direction_summary_is_bullish():
    """Three recent, strongly positive headlines produce an ``UP`` call."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA beats estimates and raises guidance after record deliveries",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.86,
                "finbert_pol": 0.91,
                "roberta_pol": 0.79,
                "finbert_score": 0.97,
                "roberta_score": 0.9,
                "agreement": 1.0,
                "conviction": 0.82,
                "significance": 0.95,
            },
            {
                "title": "Analyst upgrades TSLA and raises price target",
                "timestamp": (now - timedelta(hours=3)).isoformat(),
                "ensemble_pol": 0.72,
                "finbert_pol": 0.8,
                "roberta_pol": 0.63,
                "finbert_score": 0.92,
                "roberta_score": 0.84,
                "agreement": 1.0,
                "conviction": 0.71,
                "significance": 0.88,
            },
            {
                "title": "TSLA wins major battery contract in growth push",
                "timestamp": (now - timedelta(hours=8)).isoformat(),
                "ensemble_pol": 0.61,
                "finbert_pol": 0.68,
                "roberta_pol": 0.53,
                "finbert_score": 0.88,
                "roberta_score": 0.8,
                "agreement": 1.0,
                "conviction": 0.61,
                "significance": 0.82,
            },
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 60
    assert "Bullish" in summary["state_title"]
    assert summary["bullish_pressure"] > summary["bearish_pressure"]
    assert summary["state_explanation"]
    assert summary["bullish_drivers"]


def test_negative_direction_summary_is_bearish():
    """Three recent, strongly negative headlines produce a ``DOWN`` call."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA cuts guidance as revenue falls below estimates",
                "timestamp": now.isoformat(),
                "ensemble_pol": -0.88,
                "finbert_pol": -0.93,
                "roberta_pol": -0.81,
                "finbert_score": 0.97,
                "roberta_score": 0.9,
                "agreement": 1.0,
                "conviction": 0.86,
                "significance": 0.96,
            },
            {
                "title": "SEC investigation and lawsuit deepen pressure on TSLA stock",
                "timestamp": (now - timedelta(hours=2)).isoformat(),
                "ensemble_pol": -0.78,
                "finbert_pol": -0.85,
                "roberta_pol": -0.68,
                "finbert_score": 0.94,
                "roberta_score": 0.86,
                "agreement": 1.0,
                "conviction": 0.76,
                "significance": 0.91,
            },
            {
                "title": "Analyst downgrade sends TSLA lower on demand fears",
                "timestamp": (now - timedelta(hours=5)).isoformat(),
                "ensemble_pol": -0.67,
                "finbert_pol": -0.72,
                "roberta_pol": -0.59,
                "finbert_score": 0.89,
                "roberta_score": 0.82,
                "agreement": 1.0,
                "conviction": 0.64,
                "significance": 0.84,
            },
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] == "DOWN"
    assert summary["direction_score"] <= 40
    assert "Bearish" in summary["state_title"]
    assert summary["bearish_pressure"] > summary["bullish_pressure"]
    assert summary["bearish_risks"]


def test_mixed_flow_lowers_confidence():
    """Weak, offsetting headlines give a ``MIXED`` call with capped confidence."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA beats estimates but warns on margin headwinds",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.18,
                "finbert_pol": 0.24,
                "roberta_pol": 0.1,
                "finbert_score": 0.81,
                "roberta_score": 0.76,
                "agreement": 1.0,
                "conviction": 0.2,
                "significance": 0.72,
            },
            {
                "title": "Analyst downgrade offsets recent TSLA rally",
                "timestamp": (now - timedelta(hours=4)).isoformat(),
                "ensemble_pol": -0.22,
                "finbert_pol": -0.28,
                "roberta_pol": -0.14,
                "finbert_score": 0.82,
                "roberta_score": 0.74,
                "agreement": 1.0,
                "conviction": 0.22,
                "significance": 0.75,
            },
            {
                "title": "Investors await TSLA delivery update as outlook remains uncertain",
                "timestamp": (now - timedelta(hours=9)).isoformat(),
                "ensemble_pol": 0.02,
                "finbert_pol": 0.04,
                "roberta_pol": 0.0,
                "finbert_score": 0.7,
                "roberta_score": 0.66,
                "agreement": 1.0,
                "conviction": 0.04,
                "significance": 0.63,
            },
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_confidence"] < 70
    assert summary["direction_call"] == "MIXED"
    assert 35 <= summary["direction_score"] <= 65


def test_single_generic_headline_is_not_overcalled():
    """One mildly positive headline stays ``MIXED`` and near the midpoint."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA launches new product for mass market buyers",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.35,
                "finbert_pol": 0.38,
                "roberta_pol": 0.3,
                "finbert_score": 0.82,
                "roberta_score": 0.79,
                "agreement": 1.0,
                "conviction": 0.3,
                "significance": 0.68,
            }
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] == "MIXED"
    assert 45 <= summary["direction_score"] <= 58
    assert summary["headline_concentration"] >= 0.95
    assert summary["effective_articles"] <= 1.1


def test_major_singleton_event_can_escape_midpoint_bias():
    """One strongly positive earnings headline is still allowed an ``UP`` call."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "AAPL beats estimates and raises guidance for next quarter",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.8,
                "finbert_pol": 0.86,
                "roberta_pol": 0.72,
                "finbert_score": 0.95,
                "roberta_score": 0.87,
                "agreement": 1.0,
                "conviction": 0.77,
                "significance": 0.93,
            }
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 57
    assert summary["vibe"] >= 7
    assert summary["event_support"] >= 0.72


def test_stale_signal_needs_fresh_confirmation():
    """A five-day-old positive plus a fresh neutral headline stays ``MIXED``."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    df = _headline_frame(
        [
            {
                "title": "TSLA beats estimates and raises guidance",
                "timestamp": (now - timedelta(days=5)).isoformat(),
                "ensemble_pol": 0.82,
                "finbert_pol": 0.9,
                "roberta_pol": 0.74,
                "finbert_score": 0.95,
                "roberta_score": 0.87,
                "agreement": 1.0,
                "conviction": 0.79,
                "significance": 0.92,
            },
            {
                "title": "Investors await TSLA update as outlook remains uncertain",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.01,
                "finbert_pol": 0.03,
                "roberta_pol": 0.0,
                "finbert_score": 0.71,
                "roberta_score": 0.67,
                "agreement": 1.0,
                "conviction": 0.02,
                "significance": 0.6,
            },
        ]
    )

    summary = engine.get_summary(df)

    assert summary["direction_call"] == "MIXED"
    assert summary["direction_confidence"] < 55
    assert summary["recency_support"] < 0.7


def test_vibe_scale_moves_off_center_for_mild_directional_lean():
    """Mildly bullish flow scores ``vibe >= 6``; mildly bearish ``vibe <= 4``."""
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)
    bullish = _headline_frame(
        [
            {
                "title": "Oracle partnership expands enterprise demand pipeline",
                "timestamp": now.isoformat(),
                "ensemble_pol": 0.39,
                "finbert_pol": 0.44,
                "roberta_pol": 0.31,
                "finbert_score": 0.85,
                "roberta_score": 0.79,
                "agreement": 1.0,
                "conviction": 0.35,
                "significance": 0.73,
            },
            {
                "title": "Analyst note turns constructive on Oracle cloud growth",
                "timestamp": (now - timedelta(hours=5)).isoformat(),
                "ensemble_pol": 0.28,
                "finbert_pol": 0.33,
                "roberta_pol": 0.2,
                "finbert_score": 0.81,
                "roberta_score": 0.74,
                "agreement": 1.0,
                "conviction": 0.25,
                "significance": 0.7,
            },
        ]
    )
    bearish = _headline_frame(
        [
            {
                "title": "Intel downgrade reflects weaker PC demand expectations",
                "timestamp": now.isoformat(),
                "ensemble_pol": -0.41,
                "finbert_pol": -0.46,
                "roberta_pol": -0.34,
                "finbert_score": 0.86,
                "roberta_score": 0.8,
                "agreement": 1.0,
                "conviction": 0.37,
                "significance": 0.75,
            },
            {
                "title": "Intel delay raises execution concerns for next launch",
                "timestamp": (now - timedelta(hours=4)).isoformat(),
                "ensemble_pol": -0.29,
                "finbert_pol": -0.34,
                "roberta_pol": -0.22,
                "finbert_score": 0.82,
                "roberta_score": 0.76,
                "agreement": 1.0,
                "conviction": 0.26,
                "significance": 0.71,
            },
        ]
    )

    bullish_summary = engine.get_summary(bullish)
    bearish_summary = engine.get_summary(bearish)

    assert bullish_summary["vibe"] >= 6
    assert bearish_summary["vibe"] <= 4


def test_estimate_time():
    """The estimate for 600 is within [20, 200] and exceeds the one for 50."""
    engine = AnalyticsEngine()
    eta = engine.estimate_time(600)
    assert 20 <= eta <= 200
    eta_small = engine.estimate_time(50)
    assert eta_small < eta


def test_self_calibration():
    """Recording the four timings below lowers the estimate for 600."""
    engine = AnalyticsEngine()
    initial_eta = engine.estimate_time(600)

    engine.record_timing("finbert_per_batch", 0.1, 1)
    engine.record_timing("roberta_per_batch", 0.05, 1)
    engine.record_timing("ranker_per_batch", 0.15, 1)
    engine.record_timing("scrape_per_article", 0.005, 1)

    calibrated_eta = engine.estimate_time(600)
    assert calibrated_eta < initial_eta


def test_cleanup():
    """``NewsScraper.cleanup()`` removes a CSV left in the working directory."""
    dummy = Path("test_dummy.csv")
    dummy.write_text("test", encoding="utf-8")
    try:
        NewsScraper.cleanup()
        assert not dummy.exists()
    finally:
        # If cleanup raised or the assertion failed, do not leave the
        # fixture file behind to pollute the working tree or later runs.
        if dummy.exists():
            dummy.unlink()