Premchan369
/

alphaforge-quant-system

+"""News + Sentiment Alpha Model using FinBERT."""
+import numpy as np
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+from typing import List, Dict, Optional
+import warnings
+warnings.filterwarnings('ignore')
class SentimentAlphaModel:
    """Financial news sentiment scorer built on a FinBERT classification pipeline.

    The constructor tries to load the tokenizer, model and a
    ``sentiment-analysis`` pipeline. If loading fails the instance stays
    usable in a degraded mode (``is_loaded`` is False) in which every text is
    scored as neutral instead of raising.
    """

    # Number of same-day articles for one ticker at which the confidence
    # weight saturates at 1.0.
    FULL_CONFIDENCE_ARTICLE_COUNT = 5

    def __init__(self, model_name: str = "ProsusAI/finbert",
                 device: str = 'cpu', max_length: int = 512):
        """Load the model and build the inference pipeline.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            device: Device string the model is moved to (e.g. ``'cpu'``,
                ``'cuda'``, ``'cuda:1'``).
            max_length: Maximum number of tokens per text; longer inputs are
                truncated by the tokenizer at inference time.
        """
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        # Define every attribute up front so a failed load leaves the
        # instance in a consistent state.
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.is_loaded = False
        print(f"Loading FinBERT model: {model_name}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.model.to(device)
            self.model.eval()
            self.pipeline = pipeline(
                "sentiment-analysis",
                model=self.model,
                tokenizer=self.tokenizer,
                # Keep the pipeline on the same device as the model; a plain
                # equality test against 'cuda' would send 'cuda:1' to CPU.
                device=self._pipeline_device_index(device),
            )
            self.is_loaded = True
        except Exception as e:
            # Deliberate best effort: callers fall back to neutral scores.
            print(f"Error loading FinBERT: {e}")
            self.is_loaded = False

    @staticmethod
    def _pipeline_device_index(device: str) -> int:
        """Translate a device string into the pipeline's integer device index.

        Returns the CUDA ordinal for ``'cuda'`` / ``'cuda:N'`` (0 when no
        ordinal is given) and -1, the CPU index, for anything else.
        """
        name, _, ordinal = str(device).partition(':')
        if name != 'cuda':
            return -1
        return int(ordinal) if ordinal.isdigit() else 0

    @staticmethod
    def _neutral_result() -> Dict:
        """Return a fresh neutral result used whenever scoring is unavailable."""
        return {'label': 'neutral', 'score': 0.5, 'sentiment_score': 0.0}

    @staticmethod
    def _to_result(raw: Dict) -> Dict:
        """Convert one raw pipeline prediction into the public result dict.

        ``sentiment_score`` is the signed classifier score: ``+score`` for a
        positive label, ``-score`` for a negative label and 0.0 for any other
        label.
        """
        label = raw['label'].lower()
        score = raw['score']
        if label == 'positive':
            sentiment_score = score
        elif label == 'negative':
            sentiment_score = -score
        else:
            sentiment_score = 0.0
        return {
            'label': label,
            'score': score,
            'sentiment_score': sentiment_score
        }

    def analyze_text(self, text: str) -> Dict:
        """Analyze sentiment of a single text.

        Returns:
            Dict with ``label``, ``score`` and ``sentiment_score`` (-1 to 1).
            A neutral result is returned when the model is not loaded or the
            pipeline raises.
        """
        if not self.is_loaded:
            return self._neutral_result()
        try:
            # Truncate in token space inside the tokenizer. Slicing the string
            # to max_length characters would discard most of the token budget.
            raw = self.pipeline(
                text, truncation=True, max_length=self.max_length
            )[0]
            return self._to_result(raw)
        except Exception as e:
            print(f"Error analyzing text: {e}")
            return self._neutral_result()

    def analyze_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
        """Analyze sentiment for a batch of texts.

        Args:
            texts: Texts to score.
            batch_size: Number of texts handed to the pipeline per call.

        Returns:
            One result dict per input text, in input order.

        Raises:
            ValueError: If the model is loaded and ``batch_size`` is < 1.
        """
        if not self.is_loaded:
            return [self._neutral_result() for _ in texts]
        if batch_size < 1:
            # A negative step would yield an empty range and silently return
            # no results at all.
            raise ValueError("batch_size must be a positive integer")
        results = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            try:
                raw_results = self.pipeline(
                    batch, truncation=True, max_length=self.max_length
                )
                # Convert the whole batch before extending `results` so a
                # failure cannot leave a partially appended batch behind.
                scored = [self._to_result(raw) for raw in raw_results]
            except Exception as e:
                print(f"Error in batch: {e}")
                # Retry one by one so a single bad item (e.g. a missing text)
                # does not neutralise every other item in the batch.
                scored = [self.analyze_text(text) for text in batch]
            results.extend(scored)
        return results

    def generate_sentiment_alpha(self, news_data: pd.DataFrame,
                                  ticker_col: str = 'ticker',
                                  text_col: str = 'text',
                                  date_col: str = 'date',
                                  window: int = 5) -> pd.DataFrame:
        """Generate daily sentiment alpha scores per asset.

        Each text is scored, scores are averaged per (date, ticker), weighted
        by a confidence factor ``min(article_count / 5, 1)`` and smoothed with
        a rolling mean over each ticker's rows. The window counts rows, i.e.
        dates on which the ticker has news, not calendar days.

        Args:
            news_data: DataFrame with columns [date, ticker, text].
            ticker_col: Name of the ticker column.
            text_col: Name of the text column.
            date_col: Name of the date column.
            window: Rolling window length in rows per ticker.

        Returns:
            DataFrame with columns [date_col, ticker_col,
            'sentiment_alpha_smooth', 'sentiment_count', 'confidence'], sorted
            by ticker then date. When the model is not loaded the same columns
            are returned with every alpha equal to 0.0.
        """
        output_cols = [date_col, ticker_col, 'sentiment_alpha_smooth',
                       'sentiment_count', 'confidence']
        if self.is_loaded:
            print(f"Analyzing sentiment for {len(news_data)} news items...")
        else:
            # analyze_batch yields neutral scores in this mode, so the normal
            # aggregation below produces zeros with the regular schema.
            print("FinBERT not loaded, returning zeros")
        if news_data.empty:
            return pd.DataFrame(columns=output_cols)

        sentiments = self.analyze_batch(news_data[text_col].tolist())
        scored = news_data.copy()  # never mutate the caller's frame
        scored['sentiment_score'] = [s['sentiment_score'] for s in sentiments]

        # Aggregate by ticker and date
        daily = scored.groupby([date_col, ticker_col]).agg(
            sentiment_mean=('sentiment_score', 'mean'),
            sentiment_count=('sentiment_score', 'count'),
        ).reset_index()

        # Apply confidence weighting (more articles = more confident)
        daily['confidence'] = np.minimum(
            daily['sentiment_count'] / self.FULL_CONFIDENCE_ARTICLE_COUNT, 1.0
        )
        daily['sentiment_alpha'] = daily['sentiment_mean'] * daily['confidence']

        # Rolling window smoothing; rows must be in date order within a ticker
        daily = daily.sort_values([ticker_col, date_col])
        daily['sentiment_alpha_smooth'] = daily.groupby(ticker_col)[
            'sentiment_alpha'
        ].transform(lambda x: x.rolling(window, min_periods=1).mean())
        return daily[output_cols]

    def generate_synthetic_news(self, tickers: List[str],
                                 dates: pd.DatetimeIndex,
                                 n_news_per_day: int = 3) -> pd.DataFrame:
        """Generate synthetic financial news for testing.

        Headlines are drawn from fixed positive / negative / neutral templates
        with probabilities 0.35 / 0.35 / 0.30. Output is deterministic for a
        given set of arguments.

        Returns:
            DataFrame with columns ['date', 'ticker', 'text', 'source'] holding
            ``n_news_per_day`` rows per (date, ticker); empty, with the same
            columns, when there is nothing to generate.
        """
        # Local generator with the historical seed: same draws as seeding the
        # global NumPy RNG, without clobbering the caller's random state.
        rng = np.random.RandomState(42)
        templates = {
            'pos': [
                "{ticker} reports strong quarterly earnings, beating analyst expectations",
                "{ticker} announces new product launch, stock rises in pre-market",
                "Analysts upgrade {ticker} to buy rating, price target raised",
                "{ticker} secures major contract, revenue outlook improved",
                "{ticker} demonstrates strong growth in emerging markets"
            ],
            'neg': [
                "{ticker} misses earnings expectations, stock falls sharply",
                "{ticker} faces regulatory scrutiny, shares decline",
                "Analysts downgrade {ticker} amid slowing growth concerns",
                "{ticker} announces layoffs as part of restructuring plan",
                "Supply chain issues impact {ticker} quarterly guidance"
            ],
            'neu': [
                "{ticker} maintains dividend policy, no changes expected",
                "{ticker} announces board restructuring, effective next quarter",
                "Market awaits {ticker} earnings report due next week",
                "{ticker} trading volume remains within normal range",
                "Analysts maintain hold rating on {ticker}"
            ],
        }
        news_items = []
        for date in dates:
            for ticker in tickers:
                for _ in range(n_news_per_day):
                    sentiment_type = str(rng.choice(['pos', 'neg', 'neu'],
                                                    p=[0.35, 0.35, 0.3]))
                    text = rng.choice(templates[sentiment_type]).format(ticker=ticker)
                    news_items.append({
                        'date': date,
                        'ticker': ticker,
                        'text': text,
                        'source': 'synthetic'
                    })
        # Explicit columns keep the schema stable when no rows are generated.
        return pd.DataFrame(news_items,
                            columns=['date', 'ticker', 'text', 'source'])