Spaces:

Jitendra12421
/

finbert_anaylzer

Sleeping

App Files Files Community

finbert_anaylzer / tests /test_engine.py

Jitendra12421

Upload 12 files

16ae9d0 verified about 2 months ago

raw

history blame contribute delete

13.2 kB

	from datetime import datetime, timedelta, timezone
	from pathlib import Path
	import sys

	import pandas as pd

	# Put the directory one level above this file's folder at the front of
	# sys.path; presumably this is the project root that holds the `data` and
	# `engine` packages imported below, so the tests can run from any directory.
	sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

	from data.scraper import NewsScraper
	from engine.analytics import AnalyticsEngine


	def _headline_frame(rows):
	return pd.DataFrame(rows)


	def test_scraper_init():
	    """The ``limit`` passed to ``NewsScraper`` is exposed as ``scraper.limit``."""
	    scraper = NewsScraper(limit=10)
	    assert scraper.limit == 10


	def test_scraper_query_diversity():
	    """``_build_queries`` yields at least 50 queries for one ticker."""
	    scraper = NewsScraper(limit=600)
	    queries = scraper._build_queries("TSLA")
	    assert len(queries) >= 50


	def test_summary_does_not_eager_load_models():
	    """Summarising a pre-scored frame must not populate the model attributes.

	    ``finbert``, ``distilroberta`` and ``ranker`` are checked to be ``None``
	    both right after construction and after ``get_summary`` has run on a
	    single strongly positive headline.
	    """
	    engine = AnalyticsEngine()
	    assert engine.finbert is None
	    assert engine.distilroberta is None
	    assert engine.ranker is None

	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA beats estimates and raises guidance",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.82,
	                "finbert_pol": 0.9,
	                "roberta_pol": 0.75,
	                "finbert_score": 0.95,
	                "roberta_score": 0.88,
	                "agreement": 1.0,
	                "conviction": 0.8,
	                "significance": 0.9,
	            },
	        ]
	    )
	    summary = engine.get_summary(df)
	    assert summary["direction_call"] in {"UP", "MIXED"}
	    assert summary["direction_score"] >= 50
	    assert summary["event_support"] > 0.5
	    # The summary step alone must leave every model slot untouched.
	    assert engine.finbert is None
	    assert engine.distilroberta is None
	    assert engine.ranker is None


	def test_positive_direction_summary_is_bullish():
	    """Three recent, strongly positive headlines produce a bullish summary.

	    Expects an ``UP`` call, a direction score of at least 60, a state title
	    containing "Bullish", bullish pressure above bearish pressure, and
	    non-empty explanation and bullish-driver fields.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA beats estimates and raises guidance after record deliveries",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.86,
	                "finbert_pol": 0.91,
	                "roberta_pol": 0.79,
	                "finbert_score": 0.97,
	                "roberta_score": 0.9,
	                "agreement": 1.0,
	                "conviction": 0.82,
	                "significance": 0.95,
	            },
	            {
	                "title": "Analyst upgrades TSLA and raises price target",
	                "timestamp": (now - timedelta(hours=3)).isoformat(),
	                "ensemble_pol": 0.72,
	                "finbert_pol": 0.8,
	                "roberta_pol": 0.63,
	                "finbert_score": 0.92,
	                "roberta_score": 0.84,
	                "agreement": 1.0,
	                "conviction": 0.71,
	                "significance": 0.88,
	            },
	            {
	                "title": "TSLA wins major battery contract in growth push",
	                "timestamp": (now - timedelta(hours=8)).isoformat(),
	                "ensemble_pol": 0.61,
	                "finbert_pol": 0.68,
	                "roberta_pol": 0.53,
	                "finbert_score": 0.88,
	                "roberta_score": 0.8,
	                "agreement": 1.0,
	                "conviction": 0.61,
	                "significance": 0.82,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_call"] == "UP"
	    assert summary["direction_score"] >= 60
	    assert "Bullish" in summary["state_title"]
	    assert summary["bullish_pressure"] > summary["bearish_pressure"]
	    assert summary["state_explanation"]
	    assert summary["bullish_drivers"]


	def test_negative_direction_summary_is_bearish():
	    """Three recent, strongly negative headlines produce a bearish summary.

	    Expects a ``DOWN`` call, a direction score of at most 40, a state title
	    containing "Bearish", bearish pressure above bullish pressure, and a
	    non-empty bearish-risks field.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA cuts guidance as revenue falls below estimates",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": -0.88,
	                "finbert_pol": -0.93,
	                "roberta_pol": -0.81,
	                "finbert_score": 0.97,
	                "roberta_score": 0.9,
	                "agreement": 1.0,
	                "conviction": 0.86,
	                "significance": 0.96,
	            },
	            {
	                "title": "SEC investigation and lawsuit deepen pressure on TSLA stock",
	                "timestamp": (now - timedelta(hours=2)).isoformat(),
	                "ensemble_pol": -0.78,
	                "finbert_pol": -0.85,
	                "roberta_pol": -0.68,
	                "finbert_score": 0.94,
	                "roberta_score": 0.86,
	                "agreement": 1.0,
	                "conviction": 0.76,
	                "significance": 0.91,
	            },
	            {
	                "title": "Analyst downgrade sends TSLA lower on demand fears",
	                "timestamp": (now - timedelta(hours=5)).isoformat(),
	                "ensemble_pol": -0.67,
	                "finbert_pol": -0.72,
	                "roberta_pol": -0.59,
	                "finbert_score": 0.89,
	                "roberta_score": 0.82,
	                "agreement": 1.0,
	                "conviction": 0.64,
	                "significance": 0.84,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_call"] == "DOWN"
	    assert summary["direction_score"] <= 40
	    assert "Bearish" in summary["state_title"]
	    assert summary["bearish_pressure"] > summary["bullish_pressure"]
	    assert summary["bearish_risks"]


	def test_mixed_flow_lowers_confidence():
	    """Weak, conflicting headlines give a MIXED call with limited confidence.

	    The frame holds one mildly positive, one mildly negative and one
	    near-neutral headline. Expects confidence below 70, a ``MIXED`` call and
	    a direction score inside the 35-65 band.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA beats estimates but warns on margin headwinds",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.18,
	                "finbert_pol": 0.24,
	                "roberta_pol": 0.1,
	                "finbert_score": 0.81,
	                "roberta_score": 0.76,
	                "agreement": 1.0,
	                "conviction": 0.2,
	                "significance": 0.72,
	            },
	            {
	                "title": "Analyst downgrade offsets recent TSLA rally",
	                "timestamp": (now - timedelta(hours=4)).isoformat(),
	                "ensemble_pol": -0.22,
	                "finbert_pol": -0.28,
	                "roberta_pol": -0.14,
	                "finbert_score": 0.82,
	                "roberta_score": 0.74,
	                "agreement": 1.0,
	                "conviction": 0.22,
	                "significance": 0.75,
	            },
	            {
	                "title": "Investors await TSLA delivery update as outlook remains uncertain",
	                "timestamp": (now - timedelta(hours=9)).isoformat(),
	                "ensemble_pol": 0.02,
	                "finbert_pol": 0.04,
	                "roberta_pol": 0.0,
	                "finbert_score": 0.7,
	                "roberta_score": 0.66,
	                "agreement": 1.0,
	                "conviction": 0.04,
	                "significance": 0.63,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_confidence"] < 70
	    assert summary["direction_call"] == "MIXED"
	    assert 35 <= summary["direction_score"] <= 65


	def test_single_generic_headline_is_not_overcalled():
	    """A lone, moderately positive headline must stay a MIXED call.

	    Expects a direction score within 45-58, headline concentration of at
	    least 0.95 and an effective article count of at most 1.1.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA launches new product for mass market buyers",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.35,
	                "finbert_pol": 0.38,
	                "roberta_pol": 0.3,
	                "finbert_score": 0.82,
	                "roberta_score": 0.79,
	                "agreement": 1.0,
	                "conviction": 0.3,
	                "significance": 0.68,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_call"] == "MIXED"
	    assert 45 <= summary["direction_score"] <= 58
	    assert summary["headline_concentration"] >= 0.95
	    assert summary["effective_articles"] <= 1.1


	def test_major_singleton_event_can_escape_midpoint_bias():
	    """A single high-significance, strongly positive headline may still be UP.

	    Expects an ``UP`` call, a direction score of at least 57, a vibe of at
	    least 7 and event support of at least 0.72.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "AAPL beats estimates and raises guidance for next quarter",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.8,
	                "finbert_pol": 0.86,
	                "roberta_pol": 0.72,
	                "finbert_score": 0.95,
	                "roberta_score": 0.87,
	                "agreement": 1.0,
	                "conviction": 0.77,
	                "significance": 0.93,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_call"] == "UP"
	    assert summary["direction_score"] >= 57
	    assert summary["vibe"] >= 7
	    assert summary["event_support"] >= 0.72

	def test_stale_signal_needs_fresh_confirmation():
	    """A five-day-old bullish headline plus a fresh neutral one stays MIXED.

	    Expects a ``MIXED`` call, direction confidence below 55 and recency
	    support below 0.7.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    df = _headline_frame(
	        [
	            {
	                "title": "TSLA beats estimates and raises guidance",
	                # Strong signal, but dated five days back.
	                "timestamp": (now - timedelta(days=5)).isoformat(),
	                "ensemble_pol": 0.82,
	                "finbert_pol": 0.9,
	                "roberta_pol": 0.74,
	                "finbert_score": 0.95,
	                "roberta_score": 0.87,
	                "agreement": 1.0,
	                "conviction": 0.79,
	                "significance": 0.92,
	            },
	            {
	                "title": "Investors await TSLA update as outlook remains uncertain",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.01,
	                "finbert_pol": 0.03,
	                "roberta_pol": 0.0,
	                "finbert_score": 0.71,
	                "roberta_score": 0.67,
	                "agreement": 1.0,
	                "conviction": 0.02,
	                "significance": 0.6,
	            },
	        ]
	    )

	    summary = engine.get_summary(df)
	    assert summary["direction_call"] == "MIXED"
	    assert summary["direction_confidence"] < 55
	    assert summary["recency_support"] < 0.7


	def test_vibe_scale_moves_off_center_for_mild_directional_lean():
	    """Mildly one-sided headline sets must move ``vibe`` away from the middle.

	    Two moderately positive headlines must give a vibe of at least 6, and
	    two moderately negative headlines a vibe of at most 4. Both frames are
	    summarised by the same engine instance.
	    """
	    engine = AnalyticsEngine()
	    now = datetime.now(timezone.utc)
	    bullish = _headline_frame(
	        [
	            {
	                "title": "Oracle partnership expands enterprise demand pipeline",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": 0.39,
	                "finbert_pol": 0.44,
	                "roberta_pol": 0.31,
	                "finbert_score": 0.85,
	                "roberta_score": 0.79,
	                "agreement": 1.0,
	                "conviction": 0.35,
	                "significance": 0.73,
	            },
	            {
	                "title": "Analyst note turns constructive on Oracle cloud growth",
	                "timestamp": (now - timedelta(hours=5)).isoformat(),
	                "ensemble_pol": 0.28,
	                "finbert_pol": 0.33,
	                "roberta_pol": 0.2,
	                "finbert_score": 0.81,
	                "roberta_score": 0.74,
	                "agreement": 1.0,
	                "conviction": 0.25,
	                "significance": 0.7,
	            },
	        ]
	    )
	    bearish = _headline_frame(
	        [
	            {
	                "title": "Intel downgrade reflects weaker PC demand expectations",
	                "timestamp": now.isoformat(),
	                "ensemble_pol": -0.41,
	                "finbert_pol": -0.46,
	                "roberta_pol": -0.34,
	                "finbert_score": 0.86,
	                "roberta_score": 0.8,
	                "agreement": 1.0,
	                "conviction": 0.37,
	                "significance": 0.75,
	            },
	            {
	                "title": "Intel delay raises execution concerns for next launch",
	                "timestamp": (now - timedelta(hours=4)).isoformat(),
	                "ensemble_pol": -0.29,
	                "finbert_pol": -0.34,
	                "roberta_pol": -0.22,
	                "finbert_score": 0.82,
	                "roberta_score": 0.76,
	                "agreement": 1.0,
	                "conviction": 0.26,
	                "significance": 0.71,
	            },
	        ]
	    )

	    bullish_summary = engine.get_summary(bullish)
	    bearish_summary = engine.get_summary(bearish)

	    assert bullish_summary["vibe"] >= 6
	    assert bearish_summary["vibe"] <= 4


	def test_estimate_time():
	    """``estimate_time`` is bounded for 600 items and smaller for 50 items."""
	    engine = AnalyticsEngine()
	    eta = engine.estimate_time(600)
	    assert 20 <= eta <= 200

	    # A smaller workload must be estimated as strictly faster.
	    eta_small = engine.estimate_time(50)
	    assert eta_small < eta


	def test_self_calibration():
	    """Recording fast timings must lower the estimate for the same workload.

	    The estimate for 600 items is taken before and after four
	    ``record_timing`` calls; the later estimate must be strictly smaller.
	    """
	    engine = AnalyticsEngine()
	    initial_eta = engine.estimate_time(600)

	    engine.record_timing("finbert_per_batch", 0.1, 1)
	    engine.record_timing("roberta_per_batch", 0.05, 1)
	    engine.record_timing("ranker_per_batch", 0.15, 1)
	    engine.record_timing("scrape_per_article", 0.005, 1)

	    calibrated_eta = engine.estimate_time(600)
	    assert calibrated_eta < initial_eta


	def test_cleanup():
	    """``NewsScraper.cleanup()`` removes a scratch CSV from the working directory.

	    Writes ``test_dummy.csv`` relative to the current working directory,
	    calls ``NewsScraper.cleanup()`` and asserts that the file no longer
	    exists.
	    """
	    dummy = Path("test_dummy.csv")
	    dummy.write_text("test", encoding="utf-8")
	    try:
	        NewsScraper.cleanup()
	        assert not dummy.exists()
	    finally:
	        # If cleanup() raised or left the file behind, do not let the
	        # scratch file linger in the working directory after the test.
	        if dummy.exists():
	            dummy.unlink()