# File size: 12,583 Bytes
#
# c5a2a63

"""News Intelligence v1.0 — Real-Time News Sentiment + Event Detection
FinBERT-based sentiment scoring with event classification.
Falls back to regex-based analysis if FinBERT unavailable.
"""
import re, os, json, requests
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Tuple
import numpy as np

# ── Event detection keywords ─────────────────────────────────
# Event label -> lower-case patterns that signal that event.  The patterns
# are passed to ``re.findall`` by ``NewsIntelligence.classify_event``, so an
# entry such as 'q[1-4]' is a regular expression rather than a literal.
EVENT_PATTERNS = {
    'earnings': [
        'earnings', 'quarterly', 'revenue', 'eps',
        'profit', 'q[1-4]', 'fiscal',
    ],
    'fed': [
        'federal reserve', 'fed', 'fomc', 'interest rate',
        'rate hike', 'rate cut', 'powell',
    ],
    'cpi': [
        'cpi', 'inflation', 'consumer price', 'core pce',
    ],
    'jobs': [
        'jobs report', 'unemployment', 'nfp', 'nonfarm payroll', 'labor',
    ],
    'lawsuit': [
        'lawsuit', 'sec', 'doj', 'investigation',
        'antitrust', 'fine', 'settlement',
    ],
    'merger': [
        'merger', 'acquisition', 'acquire',
        'buyout', 'merging', 'takeover',
    ],
    'dividend': [
        'dividend', 'buyback', 'share repurchase', 'dividend yield',
    ],
    'split': [
        'stock split', 'split', 'reverse split',
    ],
    'upgrade': [
        'upgrade', 'upgraded', 'overweight',
        'buy rating', 'price target raised',
    ],
    'downgrade': [
        'downgrade', 'downgraded', 'underweight',
        'sell rating', 'price target cut',
    ],
    'product': [
        'product launch', 'new product', 'iphone',
        'ai model', 'release date',
    ],
    'supply_chain': [
        'supply chain', 'shortage', 'inventory',
        'chip shortage', 'factory',
    ],
    'macro': [
        'gdp', 'recession', 'economic growth',
        'fiscal policy', 'stimulus',
    ],
    'geopolitical': [
        'war', 'sanctions', 'tension',
        'china', 'trade war', 'tariff',
    ],
    'analyst': [
        'analyst', 'wall street', 'target price', 'consensus',
    ],
}

# Lower-case terms that ``NewsIntelligence.rule_sentiment`` counts as
# positive signals.
BULLISH_WORDS = [
    'beat',
    'strong',
    'growth',
    'surge',
    'rally',
    'bullish',
    'outperform',
    'exceed',
    'record',
    'milestone',
    'breakthrough',
    'partnership',
    'launch',
    'innovation',
    'momentum',
    'premium',
    'dominant',
    'leader',
    'expansion',
]

# Lower-case terms that ``NewsIntelligence.rule_sentiment`` counts as
# negative signals.
BEARISH_WORDS = [
    'miss',
    'weak',
    'decline',
    'drop',
    'crash',
    'bearish',
    'underperform',
    'loss',
    'concern',
    'warning',
    'risk',
    'lawsuit',
    'investigation',
    'fraud',
    'default',
    'bankruptcy',
    'layoff',
    'cut',
    'slash',
    'downturn',
    'recession',
    'contagion',
    'crisis',
    'collapse',
]


class NewsIntelligence:
    """Multi-source news sentiment with FinBERT + rule-based fallback."""

    def __init__(self, finbert_available: bool = None, cache_dir: str = ".cache/news"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self._finbert = None
        self._tokenizer = None
        self._sentiment_cache = {}  # ticker -> {date: score}

        if finbert_available is None:
            try:
                from transformers import AutoTokenizer, AutoModelForSequenceClassification
                self._tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
                self._finbert = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
                self._finbert.eval()
                finbert_available = True
            except Exception:
                finbert_available = False
        self.use_finbert = finbert_available

    def classify_event(self, headline: str, summary: str = "") -> Tuple[str, float]:
        """Classify an article into an event type with a severity in [0, 1].

        Every pattern in ``EVENT_PATTERNS`` is searched for in the lower-cased
        headline + summary.  Patterns are matched as whole words (optionally
        followed by a plain ``s``/``es``/``d``/``ed`` inflection) so that short
        tokens such as 'war', 'sec', 'eps' or 'fed' no longer fire inside
        unrelated words like "software", "second", "steps" or "federal".

        Args:
            headline: Article headline; ``None`` is treated as empty.
            summary: Optional article summary; ``None`` is treated as empty.

        Returns:
            ``(event_type, severity)``.  ``('general', 0.1)`` when nothing
            matches; otherwise the type with the most hits and
            ``min(1.0, hits * 0.5)``.  Ties go to the type listed first in
            ``EVENT_PATTERNS``.
        """
        text = f"{headline or ''} {summary or ''}".lower()

        scores = {}
        for event_type, patterns in EVENT_PATTERNS.items():
            hits = sum(
                # \b anchors keep the pattern from matching mid-word; the
                # optional suffix keeps plurals / past tense ("tariffs",
                # "acquired") counting as hits.
                len(re.findall(rf"\b(?:{pat})(?:e?s|e?d)?\b", text))
                for pat in patterns
            )
            if hits > 0:
                scores[event_type] = hits

        if not scores:
            return 'general', 0.1

        best = max(scores, key=scores.get)
        return best, min(1.0, scores[best] * 0.5)

    def rule_sentiment(self, headline: str, summary: str = "") -> Dict:
        """Keyword-count sentiment used when FinBERT is unavailable.

        Counts occurrences of ``BULLISH_WORDS`` and ``BEARISH_WORDS`` in the
        lower-cased headline + summary.  Words are matched on word boundaries
        (allowing a trailing ``s``/``es``/``d``/``ed``/``ing``) so that, for
        example, 'cut' is not counted inside "executive" and 'miss' is not
        counted inside "commission".

        Args:
            headline: Article headline; ``None`` is treated as empty.
            summary: Optional article summary; ``None`` is treated as empty.

        Returns:
            Dict with ``score`` (0-100, 50 = neutral), ``confidence``
            (``min(1.0, 0.1 * hits)``) and ``method`` (``'rule'``).
        """
        text = f"{headline or ''} {summary or ''}".lower()

        def count_hits(words):
            # One alternation per word list; re.escape keeps list entries literal.
            alternatives = "|".join(re.escape(w) for w in words)
            return len(re.findall(rf"\b(?:{alternatives})(?:e?s|e?d|ing)?\b", text))

        bull = count_hits(BULLISH_WORDS)
        bear = count_hits(BEARISH_WORDS)
        # The epsilon avoids division by zero when no keyword is present.
        total = bull + bear + 1e-10
        # Map to 0-100 scale
        sentiment = 50 + (bull - bear) / total * 50
        confidence = min(1.0, total * 0.1)
        return {
            'score': max(0, min(100, sentiment)),
            'confidence': confidence,
            'method': 'rule'
        }

    def finbert_sentiment(self, headline: str, summary: str = "") -> Dict:
        """FinBERT inference. Returns score 0-100."""
        if not self.use_finbert:
            return self.rule_sentiment(headline, summary)

        import torch
        text = headline
        if summary:
            text += ". " + summary[:500]

        inputs = self._tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self._finbert(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)[0].numpy()

        # FinBERT: [negative, neutral, positive]
        neg, neu, pos = probs
        # Map to 0-100
        score = 50 + (pos - neg) * 50
        confidence = 1 - neu  # Higher confidence when less neutral

        return {
            'score': max(0, min(100, score)),
            'confidence': float(confidence),
            'probs': {'negative': float(neg), 'neutral': float(neu), 'positive': float(pos)},
            'method': 'finbert'
        }

    def analyze_article(self, headline: str, summary: str = "",
                        timestamp: str = None) -> Dict:
        """Full article analysis: sentiment + event classification."""
        event_type, event_severity = self.classify_event(headline, summary)
        sentiment = self.finbert_sentiment(headline, summary)

        # Adjust sentiment for event context
        event_sentiment_override = {
            'earnings': 0,
            'fed': -10,
            'lawsuit': -25,
            'upgrade': +20,
            'downgrade': -20,
            'merger': +15,
            'dividend': +10,
            'product': +15,
        }
        adj_score = sentiment['score']
        if event_type in event_sentiment_override:
            adj_score += event_sentiment_override[event_type]
            sentiment['adjusted_score'] = max(0, min(100, adj_score))
        else:
            sentiment['adjusted_score'] = adj_score

        return {
            'headline': headline,
            'summary': summary[:200] if summary else "",
            'timestamp': timestamp or datetime.now().isoformat(),
            'sentiment': sentiment,
            'event': {
                'type': event_type,
                'severity': event_severity,
            }
        }

    def fetch_newsapi(self, query: str, api_key: str = None, days: int = 7) -> List[Dict]:
        """Fetch news from NewsAPI. Returns list of article analyses."""
        if not api_key:
            api_key = os.environ.get('NEWSAPI_KEY')
        if not api_key:
            return self._mock_news(query)

        from_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
        url = f"https://newsapi.org/v2/everything?q={query}&from={from_date}&sortBy=publishedAt&language=en&apiKey={api_key}"

        try:
            r = requests.get(url, timeout=15)
            r.raise_for_status()
            articles = r.json().get('articles', [])
            results = []
            for art in articles[:10]:
                analysis = self.analyze_article(
                    art.get('title', ''),
                    art.get('description', ''),
                    art.get('publishedAt')
                )
                results.append(analysis)
            return results
        except Exception as e:
            print(f"NewsAPI error: {e}")
            return self._mock_news(query)

    def fetch_yfinance_news(self, ticker: str) -> List[Dict]:
        """Fetch up to ten news items for ``ticker`` via yfinance and analyse them.

        Title and summary are read from the item itself or, failing that, from
        its nested ``content`` dict.  A missing or null ``content`` is treated
        as empty.

        Args:
            ticker: Symbol passed to ``yfinance.Ticker``.

        Returns:
            A list of ``analyze_article`` results, or ``_mock_news(ticker)`` if
            importing yfinance, fetching, or analysing raises (the error is
            printed).
        """
        try:
            import yfinance as yf
            news = yf.Ticker(ticker).news or []
            results = []
            for item in news[:10]:
                # ``or {}`` also covers an explicit null, which
                # ``item.get('content', {})`` would return as None.
                content = item.get('content') or {}
                title = item.get('title') or content.get('title') or ''
                summary = item.get('summary') or content.get('summary') or ''

                # Pass the publish time through, as fetch_newsapi does.
                # NOTE(review): 'pubDate' (ISO string) and
                # 'providerPublishTime' (epoch seconds) are assumed key names
                # for the two payload shapes — confirm against the installed
                # yfinance version.  If neither is present the analysis
                # timestamp defaults to "now", as before.
                published = content.get('pubDate')
                epoch = item.get('providerPublishTime')
                if not published and epoch:
                    published = datetime.fromtimestamp(epoch).isoformat()

                results.append(self.analyze_article(title, summary, published))
            return results
        except Exception as e:
            # Deliberate best-effort boundary: this method never raises.
            print(f"yfinance news error: {e}")
            return self._mock_news(ticker)

    def aggregate_sentiment(self, articles: List[Dict]) -> Dict:
        """Aggregate per-article sentiment into one confidence-weighted summary.

        Each article contributes its ``adjusted_score`` (or ``score`` when no
        adjusted value exists), weighted by its ``confidence`` (default 0.5).
        If the confidences sum to zero the plain mean is used instead, so a
        batch of neutral, zero-confidence articles aggregates to their mean
        rather than collapsing towards 0.

        Args:
            articles: Article dicts shaped like ``analyze_article`` output
                (``'sentiment'`` and ``'event'`` keys are read).

        Returns:
            Dict with ``score``, ``confidence`` (mean article confidence),
            ``volume``, ``trend``, ``bullish_count``, ``bearish_count``,
            ``neutral_count``, ``dominant_event`` and ``event_counts``.  The
            same keys are present for an empty input (score 50, trend
            ``'neutral'``, dominant event ``'general'``).
        """
        if not articles:
            return {
                'score': 50,
                'confidence': 0,
                'volume': 0,
                'trend': 'neutral',
                'bullish_count': 0,
                'bearish_count': 0,
                'neutral_count': 0,
                'dominant_event': 'general',
                'event_counts': {},
            }

        scores = []
        for art in articles:
            sent = art['sentiment']
            adj = sent['adjusted_score'] if 'adjusted_score' in sent else sent['score']
            scores.append((adj, sent.get('confidence', 0.5)))

        volume = len(scores)

        # Weighted average by confidence, with an unweighted fallback when
        # there is no weight to distribute.
        total_weight = sum(conf for _, conf in scores)
        if total_weight > 0:
            weighted_score = sum(s * c for s, c in scores) / total_weight
        else:
            weighted_score = sum(s for s, _ in scores) / volume

        # Bucket articles: above 55 is bullish, below 45 bearish, rest neutral.
        bullish = sum(1 for s, _ in scores if s > 55)
        bearish = sum(1 for s, _ in scores if s < 45)
        neutral = volume - bullish - bearish

        if bullish > bearish * 2:
            trend = 'strong_bullish'
        elif bullish > bearish:
            trend = 'bullish'
        elif bearish > bullish * 2:
            trend = 'strong_bearish'
        elif bearish > bullish:
            trend = 'bearish'
        else:
            trend = 'mixed'

        # Dominant event: most frequent type, first seen wins ties.
        event_counts = {}
        for art in articles:
            event_type = art['event']['type']
            event_counts[event_type] = event_counts.get(event_type, 0) + 1
        dominant_event = max(event_counts, key=event_counts.get)

        return {
            'score': round(float(weighted_score), 1),
            'confidence': round(total_weight / volume, 2),
            'volume': volume,
            'trend': trend,
            'bullish_count': bullish,
            'bearish_count': bearish,
            'neutral_count': neutral,
            'dominant_event': dominant_event,
            'event_counts': event_counts,
        }

    def _mock_news(self, query: str) -> List[Dict]:
        """Mock news for testing without API keys."""
        mock = [
            f"{query} beats earnings expectations, revenue surges 15%",
            f"{query} announces new AI product partnership",
            f"Analysts upgrade {query} to overweight, target raised to $500",
            f"{query} faces supply chain headwinds in Q3",
            f"{query} maintains guidance despite macro uncertainty",
        ]
        return [self.analyze_article(h) for h in mock]

    def get_full_analysis(self, ticker: str, market: str = 'US', period_days: int = 7) -> Dict:
        """Full news intelligence pipeline for a ticker."""
        # Try yfinance first
        articles = self.fetch_yfinance_news(ticker)

        # If insufficient, try NewsAPI
        if len(articles) < 3:
            api_articles = self.fetch_newsapi(ticker, days=period_days)
            articles.extend(api_articles)

        # Deduplicate by headline
        seen = set()
        unique = []
        for a in articles:
            key = a['headline'][:50].lower()
            if key not in seen:
                seen.add(key)
                unique.append(a)

        sentiment = self.aggregate_sentiment(unique)
        sentiment['articles'] = unique[:5]  # Top 5
        sentiment['ticker'] = ticker
        sentiment['market'] = market
        sentiment['timestamp'] = datetime.now().isoformat()
        return sentiment


if __name__ == '__main__':
    # Demo: analyse one ticker and print the aggregate plus the first three
    # articles returned by the pipeline.
    ni = NewsIntelligence()
    result = ni.get_full_analysis('AAPL')

    print(f"Sentiment Score: {result['score']}/100")
    print(f"Trend: {result['trend']}")
    print(f"Dominant Event: {result['dominant_event']}")
    print(f"Article Count: {result['volume']}")

    preview = result['articles'][:3]
    for art in preview:
        print(f"\n  📰 {art['headline']}")
        adjusted = art['sentiment']['adjusted_score']
        print(f"     Score: {adjusted:.1f} | Event: {art['event']['type']}")