Premchan369 committed on
Commit
d2e9075
·
verified ·
1 Parent(s): 958d6b7

Upload sentiment_model.py

Browse files
Files changed (1) hide show
  1. sentiment_model.py +197 -0
sentiment_model.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """News + Sentiment Alpha Model using FinBERT."""
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
6
+ from typing import List, Dict, Optional
7
+ import warnings
8
+ warnings.filterwarnings('ignore')
9
+
10
+
11
class SentimentAlphaModel:
    """Financial sentiment analysis using FinBERT.

    Wraps a Hugging Face sequence-classification pipeline and turns its
    ``positive`` / ``negative`` / ``neutral`` predictions into signed scores
    in ``[-1, 1]``, which can then be aggregated into a per-ticker daily
    alpha signal.

    If the model cannot be loaded, ``is_loaded`` is ``False`` and every
    analysis method degrades to neutral (zero) sentiment instead of raising.
    """

    # Number of articles per (date, ticker) at which confidence saturates at 1.0.
    FULL_CONFIDENCE_ARTICLES = 5

    def __init__(self, model_name: str = "ProsusAI/finbert",
                 device: str = 'cpu', max_length: int = 512):
        """Load the tokenizer, model and classification pipeline.

        Args:
            model_name: Hugging Face model identifier to load.
            device: Device string the model is moved to, e.g. ``'cpu'``,
                ``'cuda'`` or ``'cuda:1'``.
            max_length: Maximum number of tokens per text; longer texts are
                truncated by the tokenizer.
        """
        self.model_name = model_name
        self.device = device
        self.max_length = max_length

        # Define the attributes up front so they exist even if loading fails.
        self.tokenizer = None
        self.model = None
        self.pipeline = None

        print(f"Loading FinBERT model: {model_name}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.model.to(device)
            self.model.eval()
            self.pipeline = pipeline(
                "sentiment-analysis",
                model=self.model,
                tokenizer=self.tokenizer,
                # Keep the pipeline on the same device as the model; the
                # previous mapping sent e.g. 'cuda:1' to -1.
                device=self._pipeline_device(device),
            )
            self.is_loaded = True
        except Exception as e:
            # Best effort: callers check ``is_loaded`` and get neutral output.
            print(f"Error loading FinBERT: {e}")
            self.is_loaded = False

    @staticmethod
    def _pipeline_device(device: str) -> int:
        """Translate a device string into the integer index passed to ``pipeline``."""
        name = str(device)
        if not name.startswith('cuda'):
            return -1
        _, _, index = name.partition(':')
        return int(index) if index else 0

    @staticmethod
    def _neutral_result() -> Dict:
        """Return a fresh fallback result representing neutral sentiment."""
        return {'label': 'neutral', 'score': 0.5, 'sentiment_score': 0.0}

    @staticmethod
    def _to_result(raw: Dict) -> Dict:
        """Convert one raw pipeline prediction into the public result dict."""
        label = raw['label'].lower()
        score = raw['score']

        # Convert to numeric sentiment score (-1 to 1)
        if label == 'positive':
            sentiment_score = score
        elif label == 'negative':
            sentiment_score = -score
        else:
            sentiment_score = 0.0

        return {
            'label': label,
            'score': score,
            'sentiment_score': sentiment_score
        }

    def _classify(self, inputs):
        """Run the pipeline with tokenizer-level truncation to ``max_length`` tokens."""
        return self.pipeline(inputs, truncation=True, max_length=self.max_length)

    def analyze_text(self, text: str) -> Dict:
        """Analyze sentiment of a single text.

        Returns:
            Dict with ``label`` (lower-cased), ``score`` (model confidence)
            and ``sentiment_score`` (``+score`` for positive, ``-score`` for
            negative, ``0.0`` otherwise). A neutral result is returned when
            the model is not loaded or the pipeline raises.
        """
        if not self.is_loaded:
            return self._neutral_result()

        try:
            return self._to_result(self._classify(text)[0])
        except Exception as e:
            print(f"Error analyzing text: {e}")
            return self._neutral_result()

    def analyze_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
        """Analyze sentiment for a batch of texts.

        Args:
            texts: Texts to classify.
            batch_size: Number of texts sent to the pipeline per call.

        Returns:
            One result dict per input text, in input order. If the pipeline
            raises for a chunk, every text in that chunk gets a neutral result.

        Raises:
            ValueError: If the model is loaded and ``batch_size`` is below 1.
        """
        texts = list(texts)
        if not self.is_loaded:
            return [self._neutral_result() for _ in texts]
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        results = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            try:
                # Convert the whole chunk before extending ``results`` so a
                # failure part-way through cannot leave partial entries behind.
                converted = [self._to_result(raw) for raw in self._classify(batch)]
                if len(converted) != len(batch):
                    raise ValueError(
                        f"pipeline returned {len(converted)} results for {len(batch)} texts"
                    )
            except Exception as e:
                print(f"Error in batch: {e}")
                converted = [self._neutral_result() for _ in batch]
            results.extend(converted)

        return results

    def generate_sentiment_alpha(self, news_data: pd.DataFrame,
                                 ticker_col: str = 'ticker',
                                 text_col: str = 'text',
                                 date_col: str = 'date',
                                 window: int = 5) -> pd.DataFrame:
        """Generate daily sentiment alpha scores per asset.

        Args:
            news_data: DataFrame containing ``date_col``, ``ticker_col`` and
                ``text_col`` columns. It is not modified.
            ticker_col: Name of the ticker column.
            text_col: Name of the text column.
            date_col: Name of the date column.
            window: Number of consecutive (date, ticker) observations per
                ticker averaged by the rolling smoother. This counts rows,
                not calendar days.

        Returns:
            DataFrame with columns ``[date_col, ticker_col,
            'sentiment_alpha_smooth', 'sentiment_count', 'confidence']``,
            sorted by ticker then date. The same schema is returned when the
            model is not loaded (all alphas are then ``0.0``).
        """
        output_cols = [date_col, ticker_col, 'sentiment_alpha_smooth',
                       'sentiment_count', 'confidence']

        if news_data.empty:
            return pd.DataFrame(columns=output_cols)

        if not self.is_loaded:
            # analyze_batch yields neutral scores in this state, so the normal
            # aggregation below produces zeros with the regular schema.
            print("FinBERT not loaded, returning zeros")
        else:
            print(f"Analyzing sentiment for {len(news_data)} news items...")

        # Missing texts become empty strings so one NaN cannot break a batch.
        texts = news_data[text_col].fillna('').astype(str).tolist()
        sentiments = self.analyze_batch(texts)

        scored = news_data[[date_col, ticker_col]].copy()
        scored['sentiment_score'] = [s['sentiment_score'] for s in sentiments]

        # Aggregate by ticker and date
        daily_sentiment = (
            scored.groupby([date_col, ticker_col])['sentiment_score']
            .agg(['mean', 'count'])
            .rename(columns={'mean': 'sentiment_mean', 'count': 'sentiment_count'})
            .reset_index()
        )

        # Apply confidence weighting (more articles = more confident)
        daily_sentiment['confidence'] = np.minimum(
            daily_sentiment['sentiment_count'] / self.FULL_CONFIDENCE_ARTICLES, 1.0
        )
        daily_sentiment['sentiment_alpha'] = (
            daily_sentiment['sentiment_mean'] * daily_sentiment['confidence']
        )

        # Rolling window smoothing, computed independently for each ticker
        daily_sentiment = daily_sentiment.sort_values([ticker_col, date_col])
        daily_sentiment['sentiment_alpha_smooth'] = daily_sentiment.groupby(ticker_col)[
            'sentiment_alpha'
        ].transform(lambda x: x.rolling(window, min_periods=1).mean())

        return daily_sentiment[output_cols]

    def generate_synthetic_news(self, tickers: List[str],
                                dates: pd.DatetimeIndex,
                                n_news_per_day: int = 3) -> pd.DataFrame:
        """Generate synthetic financial news for testing.

        Output is deterministic (fixed seed 42). A private ``RandomState`` is
        used so the caller's global NumPy random state is left untouched.

        Args:
            tickers: Ticker symbols to generate headlines for.
            dates: Dates to generate headlines for.
            n_news_per_day: Headlines generated per (date, ticker) pair.

        Returns:
            DataFrame with columns ``['date', 'ticker', 'text', 'source']``;
            empty (but with those columns) when there is nothing to generate.
        """
        rng = np.random.RandomState(42)

        templates = {
            'pos': [
                "{ticker} reports strong quarterly earnings, beating analyst expectations",
                "{ticker} announces new product launch, stock rises in pre-market",
                "Analysts upgrade {ticker} to buy rating, price target raised",
                "{ticker} secures major contract, revenue outlook improved",
                "{ticker} demonstrates strong growth in emerging markets"
            ],
            'neg': [
                "{ticker} misses earnings expectations, stock falls sharply",
                "{ticker} faces regulatory scrutiny, shares decline",
                "Analysts downgrade {ticker} amid slowing growth concerns",
                "{ticker} announces layoffs as part of restructuring plan",
                "Supply chain issues impact {ticker} quarterly guidance"
            ],
            'neu': [
                "{ticker} maintains dividend policy, no changes expected",
                "{ticker} announces board restructuring, effective next quarter",
                "Market awaits {ticker} earnings report due next week",
                "{ticker} trading volume remains within normal range",
                "Analysts maintain hold rating on {ticker}"
            ],
        }
        sentiment_types = ['pos', 'neg', 'neu']
        sentiment_probs = [0.35, 0.35, 0.3]

        news_items = []
        for date in dates:
            for ticker in tickers:
                for _ in range(n_news_per_day):
                    sentiment_type = rng.choice(sentiment_types, p=sentiment_probs)
                    text = rng.choice(templates[sentiment_type]).format(ticker=ticker)
                    news_items.append({
                        'date': date,
                        'ticker': ticker,
                        'text': text,
                        'source': 'synthetic'
                    })

        # Explicit columns keep the schema stable when no items were generated.
        return pd.DataFrame(news_items, columns=['date', 'ticker', 'text', 'source'])