finbert_anaylzer / tests /test_engine.py
Jitendra12421's picture
Upload 12 files
16ae9d0 verified
from datetime import datetime, timedelta, timezone
from pathlib import Path
import sys
import pandas as pd
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from data.scraper import NewsScraper
from engine.analytics import AnalyticsEngine
def _headline_frame(rows):
return pd.DataFrame(rows)
def test_scraper_init():
    """A ``NewsScraper`` built with ``limit=10`` exposes that value as ``.limit``."""
    requested_limit = 10
    news_scraper = NewsScraper(limit=requested_limit)
    assert news_scraper.limit == requested_limit
def test_scraper_query_diversity():
    """``_build_queries("TSLA")`` on a ``limit=600`` scraper yields at least 50 queries."""
    news_scraper = NewsScraper(limit=600)
    query_count = len(news_scraper._build_queries("TSLA"))
    assert query_count >= 50
def test_summary_does_not_eager_load_models():
    """Summarising an already-scored headline must not populate the model slots.

    The ``finbert``, ``distilroberta`` and ``ranker`` attributes are checked
    for ``None`` right after construction and again after ``get_summary``
    has run on a one-row frame.
    """
    model_slots = ("finbert", "distilroberta", "ranker")

    engine = AnalyticsEngine()
    for slot in model_slots:
        assert getattr(engine, slot) is None

    now = datetime.now(timezone.utc)
    headline = {
        "title": "TSLA beats estimates and raises guidance",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.82,
        "finbert_pol": 0.9,
        "roberta_pol": 0.75,
        "finbert_score": 0.95,
        "roberta_score": 0.88,
        "agreement": 1.0,
        "conviction": 0.8,
        "significance": 0.9,
    }
    summary = engine.get_summary(_headline_frame([headline]))

    assert summary["direction_call"] in {"UP", "MIXED"}
    assert summary["direction_score"] >= 50
    assert summary["event_support"] > 0.5

    # Same slots, re-checked now that a summary has been produced.
    for slot in model_slots:
        assert getattr(engine, slot) is None
def test_positive_direction_summary_is_bullish():
    """Three positive-polarity headlines from the last 8 hours summarise as bullish.

    Expects an ``"UP"`` call, a direction score of at least 60, a state title
    containing ``"Bullish"``, bullish pressure above bearish pressure, and
    non-empty ``state_explanation`` / ``bullish_drivers`` entries.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    earnings_beat = {
        "title": "TSLA beats estimates and raises guidance after record deliveries",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.86,
        "finbert_pol": 0.91,
        "roberta_pol": 0.79,
        "finbert_score": 0.97,
        "roberta_score": 0.9,
        "agreement": 1.0,
        "conviction": 0.82,
        "significance": 0.95,
    }
    analyst_upgrade = {
        "title": "Analyst upgrades TSLA and raises price target",
        "timestamp": (now - timedelta(hours=3)).isoformat(),
        "ensemble_pol": 0.72,
        "finbert_pol": 0.8,
        "roberta_pol": 0.63,
        "finbert_score": 0.92,
        "roberta_score": 0.84,
        "agreement": 1.0,
        "conviction": 0.71,
        "significance": 0.88,
    }
    contract_win = {
        "title": "TSLA wins major battery contract in growth push",
        "timestamp": (now - timedelta(hours=8)).isoformat(),
        "ensemble_pol": 0.61,
        "finbert_pol": 0.68,
        "roberta_pol": 0.53,
        "finbert_score": 0.88,
        "roberta_score": 0.8,
        "agreement": 1.0,
        "conviction": 0.61,
        "significance": 0.82,
    }

    frame = _headline_frame([earnings_beat, analyst_upgrade, contract_win])
    summary = engine.get_summary(frame)

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 60
    assert "Bullish" in summary["state_title"]
    assert summary["bullish_pressure"] > summary["bearish_pressure"]
    # Both of these only need to be truthy (non-empty).
    assert summary["state_explanation"]
    assert summary["bullish_drivers"]
def test_negative_direction_summary_is_bearish():
    """Three negative-polarity headlines from the last 5 hours summarise as bearish.

    Expects a ``"DOWN"`` call, a direction score of at most 40, a state title
    containing ``"Bearish"``, bearish pressure above bullish pressure, and a
    non-empty ``bearish_risks`` entry.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    guidance_cut = {
        "title": "TSLA cuts guidance as revenue falls below estimates",
        "timestamp": now.isoformat(),
        "ensemble_pol": -0.88,
        "finbert_pol": -0.93,
        "roberta_pol": -0.81,
        "finbert_score": 0.97,
        "roberta_score": 0.9,
        "agreement": 1.0,
        "conviction": 0.86,
        "significance": 0.96,
    }
    regulatory_probe = {
        "title": "SEC investigation and lawsuit deepen pressure on TSLA stock",
        "timestamp": (now - timedelta(hours=2)).isoformat(),
        "ensemble_pol": -0.78,
        "finbert_pol": -0.85,
        "roberta_pol": -0.68,
        "finbert_score": 0.94,
        "roberta_score": 0.86,
        "agreement": 1.0,
        "conviction": 0.76,
        "significance": 0.91,
    }
    analyst_downgrade = {
        "title": "Analyst downgrade sends TSLA lower on demand fears",
        "timestamp": (now - timedelta(hours=5)).isoformat(),
        "ensemble_pol": -0.67,
        "finbert_pol": -0.72,
        "roberta_pol": -0.59,
        "finbert_score": 0.89,
        "roberta_score": 0.82,
        "agreement": 1.0,
        "conviction": 0.64,
        "significance": 0.84,
    }

    frame = _headline_frame([guidance_cut, regulatory_probe, analyst_downgrade])
    summary = engine.get_summary(frame)

    assert summary["direction_call"] == "DOWN"
    assert summary["direction_score"] <= 40
    assert "Bearish" in summary["state_title"]
    assert summary["bearish_pressure"] > summary["bullish_pressure"]
    # Only needs to be truthy (non-empty).
    assert summary["bearish_risks"]
def test_mixed_flow_lowers_confidence():
    """Weak, opposing headlines produce a ``"MIXED"`` call with capped confidence.

    The frame holds one mildly positive, one mildly negative and one
    near-zero polarity row. The summary must report confidence below 70 and
    a direction score within 35..65 inclusive.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    mild_positive = {
        "title": "TSLA beats estimates but warns on margin headwinds",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.18,
        "finbert_pol": 0.24,
        "roberta_pol": 0.1,
        "finbert_score": 0.81,
        "roberta_score": 0.76,
        "agreement": 1.0,
        "conviction": 0.2,
        "significance": 0.72,
    }
    mild_negative = {
        "title": "Analyst downgrade offsets recent TSLA rally",
        "timestamp": (now - timedelta(hours=4)).isoformat(),
        "ensemble_pol": -0.22,
        "finbert_pol": -0.28,
        "roberta_pol": -0.14,
        "finbert_score": 0.82,
        "roberta_score": 0.74,
        "agreement": 1.0,
        "conviction": 0.22,
        "significance": 0.75,
    }
    near_neutral = {
        "title": "Investors await TSLA delivery update as outlook remains uncertain",
        "timestamp": (now - timedelta(hours=9)).isoformat(),
        "ensemble_pol": 0.02,
        "finbert_pol": 0.04,
        "roberta_pol": 0.0,
        "finbert_score": 0.7,
        "roberta_score": 0.66,
        "agreement": 1.0,
        "conviction": 0.04,
        "significance": 0.63,
    }

    frame = _headline_frame([mild_positive, mild_negative, near_neutral])
    summary = engine.get_summary(frame)

    assert summary["direction_confidence"] < 70
    assert summary["direction_call"] == "MIXED"
    direction_score = summary["direction_score"]
    assert 35 <= direction_score <= 65
def test_single_generic_headline_is_not_overcalled():
    """One moderately positive headline alone must stay a ``"MIXED"`` call.

    Also pins the direction score to 45..58 inclusive, headline
    concentration to at least 0.95, and effective articles to at most 1.1.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    product_launch = {
        "title": "TSLA launches new product for mass market buyers",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.35,
        "finbert_pol": 0.38,
        "roberta_pol": 0.3,
        "finbert_score": 0.82,
        "roberta_score": 0.79,
        "agreement": 1.0,
        "conviction": 0.3,
        "significance": 0.68,
    }
    summary = engine.get_summary(_headline_frame([product_launch]))

    assert summary["direction_call"] == "MIXED"
    direction_score = summary["direction_score"]
    assert 45 <= direction_score <= 58
    assert summary["headline_concentration"] >= 0.95
    assert summary["effective_articles"] <= 1.1
def test_major_singleton_event_can_escape_midpoint_bias():
    """A lone strongly positive, high-significance headline may still be called ``"UP"``.

    Expects a direction score of at least 57, a vibe of at least 7 and
    event support of at least 0.72.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    earnings_beat = {
        "title": "AAPL beats estimates and raises guidance for next quarter",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.8,
        "finbert_pol": 0.86,
        "roberta_pol": 0.72,
        "finbert_score": 0.95,
        "roberta_score": 0.87,
        "agreement": 1.0,
        "conviction": 0.77,
        "significance": 0.93,
    }
    summary = engine.get_summary(_headline_frame([earnings_beat]))

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 57
    assert summary["vibe"] >= 7
    assert summary["event_support"] >= 0.72
def test_stale_signal_needs_fresh_confirmation():
    """A five-day-old positive headline plus a fresh near-neutral one stays ``"MIXED"``.

    Also requires direction confidence below 55 and recency support
    below 0.7.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    stale_positive = {
        "title": "TSLA beats estimates and raises guidance",
        "timestamp": (now - timedelta(days=5)).isoformat(),
        "ensemble_pol": 0.82,
        "finbert_pol": 0.9,
        "roberta_pol": 0.74,
        "finbert_score": 0.95,
        "roberta_score": 0.87,
        "agreement": 1.0,
        "conviction": 0.79,
        "significance": 0.92,
    }
    fresh_neutral = {
        "title": "Investors await TSLA update as outlook remains uncertain",
        "timestamp": now.isoformat(),
        "ensemble_pol": 0.01,
        "finbert_pol": 0.03,
        "roberta_pol": 0.0,
        "finbert_score": 0.71,
        "roberta_score": 0.67,
        "agreement": 1.0,
        "conviction": 0.02,
        "significance": 0.6,
    }

    # Row order matches the original fixture: stale row first, fresh row second.
    frame = _headline_frame([stale_positive, fresh_neutral])
    summary = engine.get_summary(frame)

    assert summary["direction_call"] == "MIXED"
    assert summary["direction_confidence"] < 55
    assert summary["recency_support"] < 0.7
def test_vibe_scale_moves_off_center_for_mild_directional_lean():
    """Mildly one-sided headline pairs push ``vibe`` away from the middle.

    A two-row mildly positive frame must give ``vibe >= 6`` and a two-row
    mildly negative frame must give ``vibe <= 4``. Both are summarised by
    the same engine instance, bullish first.
    """
    engine = AnalyticsEngine()
    now = datetime.now(timezone.utc)

    bullish_rows = [
        {
            "title": "Oracle partnership expands enterprise demand pipeline",
            "timestamp": now.isoformat(),
            "ensemble_pol": 0.39,
            "finbert_pol": 0.44,
            "roberta_pol": 0.31,
            "finbert_score": 0.85,
            "roberta_score": 0.79,
            "agreement": 1.0,
            "conviction": 0.35,
            "significance": 0.73,
        },
        {
            "title": "Analyst note turns constructive on Oracle cloud growth",
            "timestamp": (now - timedelta(hours=5)).isoformat(),
            "ensemble_pol": 0.28,
            "finbert_pol": 0.33,
            "roberta_pol": 0.2,
            "finbert_score": 0.81,
            "roberta_score": 0.74,
            "agreement": 1.0,
            "conviction": 0.25,
            "significance": 0.7,
        },
    ]
    bearish_rows = [
        {
            "title": "Intel downgrade reflects weaker PC demand expectations",
            "timestamp": now.isoformat(),
            "ensemble_pol": -0.41,
            "finbert_pol": -0.46,
            "roberta_pol": -0.34,
            "finbert_score": 0.86,
            "roberta_score": 0.8,
            "agreement": 1.0,
            "conviction": 0.37,
            "significance": 0.75,
        },
        {
            "title": "Intel delay raises execution concerns for next launch",
            "timestamp": (now - timedelta(hours=4)).isoformat(),
            "ensemble_pol": -0.29,
            "finbert_pol": -0.34,
            "roberta_pol": -0.22,
            "finbert_score": 0.82,
            "roberta_score": 0.76,
            "agreement": 1.0,
            "conviction": 0.26,
            "significance": 0.71,
        },
    ]

    bullish = _headline_frame(bullish_rows)
    bearish = _headline_frame(bearish_rows)

    bullish_summary = engine.get_summary(bullish)
    bearish_summary = engine.get_summary(bearish)

    assert bullish_summary["vibe"] >= 6
    assert bearish_summary["vibe"] <= 4
def test_estimate_time():
    """``estimate_time(600)`` lands in 20..200 and exceeds ``estimate_time(50)``."""
    engine = AnalyticsEngine()

    large_eta = engine.estimate_time(600)
    assert 20 <= large_eta <= 200

    # The smaller workload must produce a strictly smaller estimate.
    small_eta = engine.estimate_time(50)
    assert small_eta < large_eta
def test_self_calibration():
    """Recorded timings must lower the estimate returned by ``estimate_time(600)``.

    The estimate is taken once on a fresh engine, then again after four
    ``record_timing`` calls. The second value has to be strictly smaller.
    """
    engine = AnalyticsEngine()
    baseline_eta = engine.estimate_time(600)

    # (key, value, count) triples passed positionally, in the original order.
    # NOTE(review): the second argument looks like elapsed seconds — confirm
    # against AnalyticsEngine.record_timing.
    recorded_timings = (
        ("finbert_per_batch", 0.1, 1),
        ("roberta_per_batch", 0.05, 1),
        ("ranker_per_batch", 0.15, 1),
        ("scrape_per_article", 0.005, 1),
    )
    for timing_key, value, count in recorded_timings:
        engine.record_timing(timing_key, value, count)

    calibrated_eta = engine.estimate_time(600)
    assert calibrated_eta < baseline_eta
def test_cleanup():
    """``NewsScraper.cleanup()`` must remove a CSV written to the working directory.

    A throwaway ``test_dummy.csv`` is created relative to the current
    working directory, ``cleanup()`` is invoked, and the file is expected to
    be gone afterwards.

    The ``finally`` clause removes the dummy file if ``cleanup()`` raised or
    left it behind, so a failing run does not leak the file into the working
    tree or into later tests.
    """
    dummy_csv = Path("test_dummy.csv")
    dummy_csv.write_text("test", encoding="utf-8")
    try:
        NewsScraper.cleanup()
        assert not dummy_csv.exists()
    finally:
        # Best-effort tidy-up; does nothing when cleanup() already removed the file.
        if dummy_csv.exists():
            dummy_csv.unlink()