Spaces:

Jitendra12421
/

finbert_anaylzer

Sleeping

File size: 13,169 Bytes

from datetime import datetime, timedelta, timezone
from pathlib import Path
import sys

import pandas as pd

# Put the directory one level above this file's folder at the front of
# sys.path before the project imports below run.
# NOTE(review): presumably that directory is the project root holding the
# `data` and `engine` packages — confirm against the repository layout.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from data.scraper import NewsScraper
from engine.analytics import AnalyticsEngine


def _headline_frame(rows):
    return pd.DataFrame(rows)


def test_scraper_init():
    """The scraper exposes the ``limit`` it was constructed with."""
    requested_limit = 10
    news_scraper = NewsScraper(limit=requested_limit)
    assert news_scraper.limit == requested_limit


def test_scraper_query_diversity():
    """A scraper with limit 600 builds at least 50 queries for "TSLA"."""
    ticker_queries = NewsScraper(limit=600)._build_queries("TSLA")
    query_count = len(ticker_queries)
    assert query_count >= 50


def test_summary_does_not_eager_load_models():
    """Building a summary must leave the engine's model attributes unset.

    ``finbert``, ``distilroberta`` and ``ranker`` are checked to be ``None``
    on a fresh engine and again after ``get_summary`` has processed a single
    pre-scored positive headline.
    """
    model_attrs = ("finbert", "distilroberta", "ranker")

    engine = AnalyticsEngine()
    for attr in model_attrs:
        assert getattr(engine, attr) is None

    scored_at = datetime.now(timezone.utc)
    headline = {
        "title": "TSLA beats estimates and raises guidance",
        "timestamp": scored_at.isoformat(),
        "ensemble_pol": 0.82,
        "finbert_pol": 0.9,
        "roberta_pol": 0.75,
        "finbert_score": 0.95,
        "roberta_score": 0.88,
        "agreement": 1.0,
        "conviction": 0.8,
        "significance": 0.9,
    }
    summary = engine.get_summary(_headline_frame([headline]))

    assert summary["direction_call"] in {"UP", "MIXED"}
    assert summary["direction_score"] >= 50
    assert summary["event_support"] > 0.5

    # The summary path must not have populated any model attribute.
    for attr in model_attrs:
        assert getattr(engine, attr) is None


def test_positive_direction_summary_is_bullish():
    """Three positive, agreeing headlines must yield an "UP" summary.

    The rows are pre-scored, spaced 0, 3 and 8 hours before now, and all
    carry positive polarity.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    earnings_beat = {
        "title": "TSLA beats estimates and raises guidance after record deliveries",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.86,
        "finbert_pol": 0.91,
        "roberta_pol": 0.79,
        "finbert_score": 0.97,
        "roberta_score": 0.9,
        "agreement": 1.0,
        "conviction": 0.82,
        "significance": 0.95,
    }
    analyst_upgrade = {
        "title": "Analyst upgrades TSLA and raises price target",
        "timestamp": (reference - timedelta(hours=3)).isoformat(),
        "ensemble_pol": 0.72,
        "finbert_pol": 0.8,
        "roberta_pol": 0.63,
        "finbert_score": 0.92,
        "roberta_score": 0.84,
        "agreement": 1.0,
        "conviction": 0.71,
        "significance": 0.88,
    }
    contract_win = {
        "title": "TSLA wins major battery contract in growth push",
        "timestamp": (reference - timedelta(hours=8)).isoformat(),
        "ensemble_pol": 0.61,
        "finbert_pol": 0.68,
        "roberta_pol": 0.53,
        "finbert_score": 0.88,
        "roberta_score": 0.8,
        "agreement": 1.0,
        "conviction": 0.61,
        "significance": 0.82,
    }
    headlines = _headline_frame([earnings_beat, analyst_upgrade, contract_win])

    summary = engine.get_summary(headlines)

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 60
    assert "Bullish" in summary["state_title"]
    assert summary["bullish_pressure"] > summary["bearish_pressure"]
    # Explanation text and driver list must both be non-empty.
    assert summary["state_explanation"]
    assert summary["bullish_drivers"]


def test_negative_direction_summary_is_bearish():
    """Three negative, agreeing headlines must yield a "DOWN" summary.

    The rows are pre-scored, spaced 0, 2 and 5 hours before now, and all
    carry negative polarity.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    guidance_cut = {
        "title": "TSLA cuts guidance as revenue falls below estimates",
        "timestamp": reference.isoformat(),
        "ensemble_pol": -0.88,
        "finbert_pol": -0.93,
        "roberta_pol": -0.81,
        "finbert_score": 0.97,
        "roberta_score": 0.9,
        "agreement": 1.0,
        "conviction": 0.86,
        "significance": 0.96,
    }
    legal_pressure = {
        "title": "SEC investigation and lawsuit deepen pressure on TSLA stock",
        "timestamp": (reference - timedelta(hours=2)).isoformat(),
        "ensemble_pol": -0.78,
        "finbert_pol": -0.85,
        "roberta_pol": -0.68,
        "finbert_score": 0.94,
        "roberta_score": 0.86,
        "agreement": 1.0,
        "conviction": 0.76,
        "significance": 0.91,
    }
    analyst_downgrade = {
        "title": "Analyst downgrade sends TSLA lower on demand fears",
        "timestamp": (reference - timedelta(hours=5)).isoformat(),
        "ensemble_pol": -0.67,
        "finbert_pol": -0.72,
        "roberta_pol": -0.59,
        "finbert_score": 0.89,
        "roberta_score": 0.82,
        "agreement": 1.0,
        "conviction": 0.64,
        "significance": 0.84,
    }
    headlines = _headline_frame([guidance_cut, legal_pressure, analyst_downgrade])

    summary = engine.get_summary(headlines)

    assert summary["direction_call"] == "DOWN"
    assert summary["direction_score"] <= 40
    assert "Bearish" in summary["state_title"]
    assert summary["bearish_pressure"] > summary["bullish_pressure"]
    # The risk list must be non-empty.
    assert summary["bearish_risks"]


def test_mixed_flow_lowers_confidence():
    """Weak, conflicting headlines must produce a low-confidence "MIXED" call.

    The three rows carry small polarities of both signs (0.18, -0.22, 0.02)
    and are spaced 0, 4 and 9 hours before now.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    qualified_beat = {
        "title": "TSLA beats estimates but warns on margin headwinds",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.18,
        "finbert_pol": 0.24,
        "roberta_pol": 0.1,
        "finbert_score": 0.81,
        "roberta_score": 0.76,
        "agreement": 1.0,
        "conviction": 0.2,
        "significance": 0.72,
    }
    offsetting_downgrade = {
        "title": "Analyst downgrade offsets recent TSLA rally",
        "timestamp": (reference - timedelta(hours=4)).isoformat(),
        "ensemble_pol": -0.22,
        "finbert_pol": -0.28,
        "roberta_pol": -0.14,
        "finbert_score": 0.82,
        "roberta_score": 0.74,
        "agreement": 1.0,
        "conviction": 0.22,
        "significance": 0.75,
    }
    wait_and_see = {
        "title": "Investors await TSLA delivery update as outlook remains uncertain",
        "timestamp": (reference - timedelta(hours=9)).isoformat(),
        "ensemble_pol": 0.02,
        "finbert_pol": 0.04,
        "roberta_pol": 0.0,
        "finbert_score": 0.7,
        "roberta_score": 0.66,
        "agreement": 1.0,
        "conviction": 0.04,
        "significance": 0.63,
    }
    headlines = _headline_frame([qualified_beat, offsetting_downgrade, wait_and_see])

    summary = engine.get_summary(headlines)

    assert summary["direction_confidence"] < 70
    assert summary["direction_call"] == "MIXED"
    score = summary["direction_score"]
    assert 35 <= score <= 65


def test_single_generic_headline_is_not_overcalled():
    """One mildly positive headline must stay "MIXED" with a near-neutral score.

    Also checks that the summary reports the input as concentrated in a
    single article (``headline_concentration`` >= 0.95,
    ``effective_articles`` <= 1.1).
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    product_launch = {
        "title": "TSLA launches new product for mass market buyers",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.35,
        "finbert_pol": 0.38,
        "roberta_pol": 0.3,
        "finbert_score": 0.82,
        "roberta_score": 0.79,
        "agreement": 1.0,
        "conviction": 0.3,
        "significance": 0.68,
    }

    summary = engine.get_summary(_headline_frame([product_launch]))

    assert summary["direction_call"] == "MIXED"
    score = summary["direction_score"]
    assert 45 <= score <= 58
    assert summary["headline_concentration"] >= 0.95
    assert summary["effective_articles"] <= 1.1


def test_major_singleton_event_can_escape_midpoint_bias():
    """A single strong, high-significance headline may still be called "UP".

    The one row has polarity 0.8 and significance 0.93; the summary must
    report a score of at least 57, a vibe of at least 7 and event support of
    at least 0.72.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    beat_and_raise = {
        "title": "AAPL beats estimates and raises guidance for next quarter",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.8,
        "finbert_pol": 0.86,
        "roberta_pol": 0.72,
        "finbert_score": 0.95,
        "roberta_score": 0.87,
        "agreement": 1.0,
        "conviction": 0.77,
        "significance": 0.93,
    }

    summary = engine.get_summary(_headline_frame([beat_and_raise]))

    assert summary["direction_call"] == "UP"
    assert summary["direction_score"] >= 57
    assert summary["vibe"] >= 7
    assert summary["event_support"] >= 0.72


def test_stale_signal_needs_fresh_confirmation():
    """An old strong headline plus a fresh neutral one must stay "MIXED".

    The strongly positive row is five days old; the only current row is
    near-neutral (polarity 0.01). Confidence must be below 55 and recency
    support below 0.7.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    stale_beat = {
        "title": "TSLA beats estimates and raises guidance",
        "timestamp": (reference - timedelta(days=5)).isoformat(),
        "ensemble_pol": 0.82,
        "finbert_pol": 0.9,
        "roberta_pol": 0.74,
        "finbert_score": 0.95,
        "roberta_score": 0.87,
        "agreement": 1.0,
        "conviction": 0.79,
        "significance": 0.92,
    }
    fresh_neutral = {
        "title": "Investors await TSLA update as outlook remains uncertain",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.01,
        "finbert_pol": 0.03,
        "roberta_pol": 0.0,
        "finbert_score": 0.71,
        "roberta_score": 0.67,
        "agreement": 1.0,
        "conviction": 0.02,
        "significance": 0.6,
    }
    headlines = _headline_frame([stale_beat, fresh_neutral])

    summary = engine.get_summary(headlines)

    assert summary["direction_call"] == "MIXED"
    assert summary["direction_confidence"] < 55
    assert summary["recency_support"] < 0.7


def test_vibe_scale_moves_off_center_for_mild_directional_lean():
    """Mildly one-sided flows must move ``vibe`` away from the middle.

    Two moderately positive headlines must give a vibe of at least 6, and
    two moderately negative headlines a vibe of at most 4.
    """
    engine = AnalyticsEngine()
    reference = datetime.now(timezone.utc)

    oracle_partnership = {
        "title": "Oracle partnership expands enterprise demand pipeline",
        "timestamp": reference.isoformat(),
        "ensemble_pol": 0.39,
        "finbert_pol": 0.44,
        "roberta_pol": 0.31,
        "finbert_score": 0.85,
        "roberta_score": 0.79,
        "agreement": 1.0,
        "conviction": 0.35,
        "significance": 0.73,
    }
    oracle_analyst_note = {
        "title": "Analyst note turns constructive on Oracle cloud growth",
        "timestamp": (reference - timedelta(hours=5)).isoformat(),
        "ensemble_pol": 0.28,
        "finbert_pol": 0.33,
        "roberta_pol": 0.2,
        "finbert_score": 0.81,
        "roberta_score": 0.74,
        "agreement": 1.0,
        "conviction": 0.25,
        "significance": 0.7,
    }
    bullish = _headline_frame([oracle_partnership, oracle_analyst_note])

    intel_downgrade = {
        "title": "Intel downgrade reflects weaker PC demand expectations",
        "timestamp": reference.isoformat(),
        "ensemble_pol": -0.41,
        "finbert_pol": -0.46,
        "roberta_pol": -0.34,
        "finbert_score": 0.86,
        "roberta_score": 0.8,
        "agreement": 1.0,
        "conviction": 0.37,
        "significance": 0.75,
    }
    intel_delay = {
        "title": "Intel delay raises execution concerns for next launch",
        "timestamp": (reference - timedelta(hours=4)).isoformat(),
        "ensemble_pol": -0.29,
        "finbert_pol": -0.34,
        "roberta_pol": -0.22,
        "finbert_score": 0.82,
        "roberta_score": 0.76,
        "agreement": 1.0,
        "conviction": 0.26,
        "significance": 0.71,
    }
    bearish = _headline_frame([intel_downgrade, intel_delay])

    # Summarise both frames with the same engine, bullish first.
    bullish_vibe = engine.get_summary(bullish)["vibe"]
    bearish_vibe = engine.get_summary(bearish)["vibe"]

    assert bullish_vibe >= 6
    assert bearish_vibe <= 4


def test_estimate_time():
    """``estimate_time(600)`` lies in [20, 200] and exceeds ``estimate_time(50)``."""
    analytics = AnalyticsEngine()

    large_eta = analytics.estimate_time(600)
    assert 20 <= large_eta <= 200

    small_eta = analytics.estimate_time(50)
    assert small_eta < large_eta


def test_self_calibration():
    """Recording the timings below must lower the estimate for 600 items."""
    analytics = AnalyticsEngine()
    baseline_eta = analytics.estimate_time(600)

    # (timing key, recorded value) pairs, fed in this order with a count of 1.
    recorded_timings = (
        ("finbert_per_batch", 0.1),
        ("roberta_per_batch", 0.05),
        ("ranker_per_batch", 0.15),
        ("scrape_per_article", 0.005),
    )
    for timing_key, value in recorded_timings:
        analytics.record_timing(timing_key, value, 1)

    recalibrated_eta = analytics.estimate_time(600)
    assert recalibrated_eta < baseline_eta


def test_cleanup():
    """``NewsScraper.cleanup()`` must remove a CSV left in the working directory.

    A dummy ``test_dummy.csv`` is written to the current directory, cleanup
    is invoked, and the file is asserted to be gone. The ``finally`` clause
    removes the file if it is still there, so a failing or raising
    ``cleanup()`` does not leave the fixture behind for later runs.
    """
    dummy_csv = Path("test_dummy.csv")
    dummy_csv.write_text("test", encoding="utf-8")
    try:
        NewsScraper.cleanup()
        assert not dummy_csv.exists()
    finally:
        # Only reached with the file present when cleanup() did not do its job.
        if dummy_csv.exists():
            dummy_csv.unlink()