ckharche committed on
Commit
9606d57
·
verified ·
1 Parent(s): 461b81a

Update trade_analysis/enhanced_sentiment.py

Browse files
Files changed (1) hide show
  1. trade_analysis/enhanced_sentiment.py +572 -572
trade_analysis/enhanced_sentiment.py CHANGED
@@ -1,573 +1,573 @@
1
- # trade_analysis/enhanced_sentiment.py
2
-
3
- import torch
4
- import torch.nn as nn
5
- from transformers import (
6
- AutoTokenizer, AutoModelForSequenceClassification,
7
- AutoModelForCausalLM, BitsAndBytesConfig, pipeline
8
- )
9
- from typing import Dict, List, Optional, Tuple
10
- import numpy as np
11
- import pandas as pd
12
- import asyncio
13
- from concurrent.futures import ThreadPoolExecutor
14
- import json
15
- import os
16
- from datetime import datetime, timedelta
17
- import warnings
18
- warnings.filterwarnings("ignore")
19
-
20
- class EnhancedFinancialSentimentAnalyzer:
21
- """
22
- SOTA Financial Sentiment Analysis using 2025 models
23
- Optimized for H100/H200 GPUs and momentum trading
24
- """
25
-
26
- def __init__(self, device: str = "auto"):
27
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
28
- self.models = {}
29
- self.tokenizers = {}
30
- self.pipelines = {}
31
-
32
- # Enhanced model configuration - WORKING MODELS ONLY
33
- self.model_configs = {
34
- # Tier 1: SOTA Financial Models (2025)
35
- 'finbert_prosus': {
36
- 'model_id': 'ProsusAI/finbert',
37
- 'weight': 0.25,
38
- 'type': 'classification',
39
- 'specialization': 'general_financial'
40
- },
41
- 'finbert_tone': {
42
- 'model_id': 'yiyanghkust/finbert-tone',
43
- 'weight': 0.25,
44
- 'type': 'classification',
45
- 'specialization': 'tone_analysis'
46
- },
47
- 'roberta_financial': {
48
- 'model_id': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
49
- 'weight': 0.20,
50
- 'type': 'classification',
51
- 'specialization': 'social_sentiment'
52
- },
53
- 'distilroberta_financial': {
54
- 'model_id': 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis',
55
- 'weight': 0.20,
56
- 'type': 'classification',
57
- 'specialization': 'news_sentiment'
58
- },
59
-
60
- # Tier 2: Specialized Models
61
- 'fintwit_bert': {
62
- 'model_id': 'StephanAkkerman/FinTwitBERT-sentiment',
63
- 'weight': 0.10,
64
- 'type': 'classification',
65
- 'specialization': 'social_trading'
66
- }
67
- }
68
-
69
- # Renormalize weights
70
- total_weight = sum(config['weight'] for config in self.model_configs.values())
71
- for config in self.model_configs.values():
72
- config['weight'] /= total_weight
73
-
74
- def initialize_models(self):
75
- """Load all sentiment models"""
76
- print("Loading Enhanced Financial Sentiment Models...")
77
-
78
- for model_key, config in self.model_configs.items():
79
- try:
80
- print(f"Loading {model_key}...")
81
-
82
- if config['type'] == 'classification':
83
- # Load classification models
84
- self.tokenizers[model_key] = AutoTokenizer.from_pretrained(
85
- config['model_id'],
86
- trust_remote_code=True
87
- )
88
- self.models[model_key] = AutoModelForSequenceClassification.from_pretrained(
89
- config['model_id'],
90
- trust_remote_code=True
91
- ).to(self.device)
92
-
93
- elif config['type'] == 'causal':
94
- # Skip causal models for now since they're having issues
95
- print(f"Skipping causal model {model_key} - focusing on classification models")
96
- config['weight'] = 0
97
- continue
98
-
99
- print(f"✅ {model_key} loaded successfully")
100
-
101
- except Exception as e:
102
- print(f"❌ Failed to load {model_key}: {e}")
103
- config['weight'] = 0
104
-
105
- # Create sentiment pipeline for fast inference
106
- self._create_pipelines()
107
- print(f"✅ Loaded {len(self.models)} sentiment models")
108
-
109
- def _create_pipelines(self):
110
- """Create HuggingFace pipelines for efficient inference"""
111
- for model_key, config in self.model_configs.items():
112
- if config['weight'] > 0 and model_key in self.models:
113
- if config['type'] == 'classification':
114
- try:
115
- self.pipelines[model_key] = pipeline(
116
- "sentiment-analysis",
117
- model=self.models[model_key],
118
- tokenizer=self.tokenizers[model_key],
119
- device=0 if torch.cuda.is_available() else -1,
120
- return_all_scores=True
121
- )
122
- except Exception as e:
123
- print(f"Failed to create pipeline for {model_key}: {e}")
124
-
125
- def analyze_comprehensive_sentiment(self, news_df: pd.DataFrame, social_df: pd.DataFrame, symbol: str) -> Dict:
126
- """
127
- Comprehensive sentiment analysis for momentum trading
128
- """
129
- if news_df.empty and social_df.empty:
130
- return self._default_sentiment()
131
-
132
- # Prepare text data
133
- texts = []
134
- metadata = []
135
-
136
- # Add news headlines
137
- if not news_df.empty:
138
- for _, row in news_df.iterrows():
139
- text = row.get('headline', '') or row.get('title', '')
140
- if text:
141
- texts.append(str(text))
142
- metadata.append({
143
- 'source': 'news',
144
- 'timestamp': row.get('datetime', datetime.now()),
145
- 'impact': self._calculate_news_impact(str(text))
146
- })
147
-
148
- # Add social media content
149
- if not social_df.empty:
150
- for _, row in social_df.iterrows():
151
- text = row.get('title', '') or row.get('content', '')
152
- if text:
153
- texts.append(str(text))
154
- metadata.append({
155
- 'source': 'social',
156
- 'timestamp': row.get('created_utc', datetime.now()),
157
- 'score': row.get('score', 0)
158
- })
159
-
160
- if not texts:
161
- return self._default_sentiment()
162
-
163
- # Run ensemble sentiment analysis
164
- sentiment_results = self._run_ensemble_sentiment(texts)
165
-
166
- # Calculate weighted sentiment scores
167
- financial_sentiment = self._calculate_financial_sentiment(sentiment_results, metadata)
168
- social_sentiment = self._calculate_social_sentiment(sentiment_results, metadata)
169
-
170
- # Economic impact analysis
171
- economic_impact = self._analyze_economic_impact(texts)
172
-
173
- # Create momentum-focused composite score
174
- composite_score = self._calculate_momentum_composite(
175
- financial_sentiment, social_sentiment, economic_impact
176
- )
177
-
178
- # Generate key themes for transparency
179
- key_themes = self._extract_key_themes(texts, sentiment_results)
180
-
181
- return {
182
- 'financial_sentiment': financial_sentiment,
183
- 'social_sentiment': social_sentiment,
184
- 'economic_impact': economic_impact,
185
- 'composite_score': composite_score,
186
- 'confidence': self._calculate_confidence(sentiment_results),
187
- 'key_themes': key_themes,
188
- 'model_count': len([k for k, v in self.model_configs.items() if v['weight'] > 0])
189
- }
190
-
191
- def _run_ensemble_sentiment(self, texts: List[str]) -> Dict:
192
- """Run all available models on the text data"""
193
- results = {}
194
-
195
- for model_key, config in self.model_configs.items():
196
- if config['weight'] == 0 or model_key not in self.models:
197
- continue
198
-
199
- try:
200
- if config['type'] == 'classification':
201
- # Use pipeline for fast inference
202
- if model_key in self.pipelines:
203
- predictions = []
204
- for text in texts:
205
- result = self.pipelines[model_key](text[:512])
206
- # Convert to standardized score
207
- if isinstance(result, list) and len(result) > 0:
208
- if isinstance(result[0], dict):
209
- score = self._standardize_classification_score(result)
210
- else:
211
- score = self._standardize_classification_score(result[0])
212
- else:
213
- score = 0.0
214
- predictions.append(score)
215
- else:
216
- predictions = self._run_classification_batch(texts, model_key)
217
-
218
- elif config['type'] == 'causal':
219
- # Skip causal for now
220
- continue
221
-
222
- results[model_key] = {
223
- 'predictions': predictions,
224
- 'weight': config['weight'],
225
- 'specialization': config['specialization']
226
- }
227
-
228
- except Exception as e:
229
- print(f"Error running {model_key}: {e}")
230
- continue
231
-
232
- return results
233
-
234
- def _run_classification_batch(self, texts: List[str], model_key: str) -> List[float]:
235
- """Run classification model in batches"""
236
- model = self.models[model_key]
237
- tokenizer = self.tokenizers[model_key]
238
-
239
- predictions = []
240
- batch_size = 8 # Reduced for stability
241
-
242
- for i in range(0, len(texts), batch_size):
243
- batch_texts = texts[i:i + batch_size]
244
-
245
- try:
246
- inputs = tokenizer(
247
- batch_texts,
248
- padding=True,
249
- truncation=True,
250
- max_length=512,
251
- return_tensors="pt"
252
- ).to(self.device)
253
-
254
- with torch.no_grad():
255
- outputs = model(**inputs)
256
- probs = torch.softmax(outputs.logits, dim=-1)
257
-
258
- for prob in probs:
259
- if prob.shape[0] == 3: # [negative, neutral, positive]
260
- score = prob[2].item() - prob[0].item()
261
- else: # [negative, positive]
262
- score = prob[1].item() - prob[0].item()
263
- predictions.append(score)
264
- except Exception as e:
265
- print(f"Batch processing error: {e}")
266
- # Add neutral scores for failed batch
267
- predictions.extend([0.0] * len(batch_texts))
268
-
269
- return predictions
270
-
271
- def _standardize_classification_score(self, result) -> float:
272
- """Convert pipeline output to standardized score"""
273
- if not result:
274
- return 0.0
275
-
276
- try:
277
- # Handle nested list structure
278
- if isinstance(result, list) and len(result) > 0:
279
- if isinstance(result[0], list):
280
- result = result[0]
281
-
282
- # Convert to dict if not already
283
- if isinstance(result, list):
284
- scores = {}
285
- for item in result:
286
- if isinstance(item, dict) and 'label' in item:
287
- scores[item['label'].upper()] = item['score']
288
- else:
289
- return 0.0
290
-
291
- positive_labels = ['POSITIVE', 'POS', 'BULLISH', 'LABEL_2']
292
- negative_labels = ['NEGATIVE', 'NEG', 'BEARISH', 'LABEL_0']
293
-
294
- positive_score = sum(scores.get(label, 0) for label in positive_labels)
295
- negative_score = sum(scores.get(label, 0) for label in negative_labels)
296
-
297
- return positive_score - negative_score
298
- except Exception as e:
299
- print(f"Score standardization error: {e}")
300
- return 0.0
301
-
302
- def _calculate_financial_sentiment(self, results: Dict, metadata: List[Dict]) -> float:
303
- """Calculate weighted financial sentiment score"""
304
- if not results:
305
- return 0.0
306
-
307
- weighted_scores = []
308
- total_weight = 0
309
-
310
- for model_key, model_results in results.items():
311
- predictions = model_results['predictions']
312
- weight = model_results['weight']
313
- specialization = model_results['specialization']
314
-
315
- # Apply specialization bonus
316
- if specialization in ['general_financial', 'earnings', 'news_sentiment']:
317
- weight *= 1.2
318
-
319
- # Weight by news impact
320
- for i, pred in enumerate(predictions[:len(metadata)]):
321
- meta = metadata[i] if i < len(metadata) else {'source': 'unknown', 'impact': 1.0}
322
- if meta['source'] == 'news':
323
- impact_weight = meta.get('impact', 1.0)
324
- weighted_scores.append(pred * weight * impact_weight)
325
- total_weight += weight * impact_weight
326
- else:
327
- weighted_scores.append(pred * weight)
328
- total_weight += weight
329
-
330
- return sum(weighted_scores) / max(total_weight, 1)
331
-
332
- def _calculate_social_sentiment(self, results: Dict, metadata: List[Dict]) -> float:
333
- """Calculate social media sentiment score"""
334
- if not results:
335
- return 0.0
336
-
337
- social_scores = []
338
-
339
- for model_key, model_results in results.items():
340
- predictions = model_results['predictions']
341
- specialization = model_results['specialization']
342
-
343
- # Prioritize social-specific models
344
- weight = 1.5 if specialization == 'social_sentiment' else 1.0
345
-
346
- for i, pred in enumerate(predictions[:len(metadata)]):
347
- meta = metadata[i] if i < len(metadata) else {'source': 'unknown', 'score': 0}
348
- if meta['source'] == 'social':
349
- # Weight by social score (upvotes, likes, etc.)
350
- social_weight = min(max(meta.get('score', 0) / 10, 0.5), 2.0)
351
- social_scores.append(pred * weight * social_weight)
352
-
353
- return np.mean(social_scores) if social_scores else 0.0
354
-
355
- def _analyze_economic_impact(self, texts: List[str]) -> float:
356
- """Analyze economic impact using keyword analysis"""
357
- impact_keywords = {
358
- 'high_impact': ['fed', 'federal reserve', 'inflation', 'gdp', 'unemployment', 'interest rate'],
359
- 'medium_impact': ['earnings', 'revenue', 'profit', 'guidance', 'outlook'],
360
- 'market_structure': ['merger', 'acquisition', 'ipo', 'split', 'dividend']
361
- }
362
-
363
- total_impact = 0
364
- impact_count = 0
365
-
366
- for text in texts:
367
- text_lower = text.lower()
368
-
369
- # High impact events
370
- high_matches = sum(1 for keyword in impact_keywords['high_impact']
371
- if keyword in text_lower)
372
- if high_matches > 0:
373
- total_impact += high_matches * 3
374
- impact_count += 1
375
-
376
- # Medium impact events
377
- medium_matches = sum(1 for keyword in impact_keywords['medium_impact']
378
- if keyword in text_lower)
379
- if medium_matches > 0:
380
- total_impact += medium_matches * 2
381
- impact_count += 1
382
-
383
- # Market structure events
384
- structure_matches = sum(1 for keyword in impact_keywords['market_structure']
385
- if keyword in text_lower)
386
- if structure_matches > 0:
387
- total_impact += structure_matches * 1.5
388
- impact_count += 1
389
-
390
- return total_impact / max(impact_count, 1)
391
-
392
- def _calculate_momentum_composite(self, financial_sent: float, social_sent: float,
393
- economic_impact: float) -> float:
394
- """Calculate composite score optimized for momentum trading"""
395
- # Momentum trading weights - prioritize speed and strength
396
- financial_weight = 0.5 # Primary signal
397
- social_weight = 0.2 # Secondary confirmation
398
- economic_weight = 0.3 # Impact multiplier
399
-
400
- composite = (financial_sent * financial_weight +
401
- social_sent * social_weight +
402
- economic_impact * economic_weight * 0.1) # Scale economic impact
403
-
404
- # Apply momentum amplification for strong signals
405
- if abs(composite) > 0.5:
406
- composite *= 1.2
407
-
408
- return np.clip(composite, -1.0, 1.0)
409
-
410
- def _calculate_confidence(self, results: Dict) -> str:
411
- """Calculate confidence level based on model agreement"""
412
- if not results:
413
- return "LOW"
414
-
415
- all_predictions = []
416
- for model_results in results.values():
417
- all_predictions.extend(model_results['predictions'])
418
-
419
- if not all_predictions:
420
- return "LOW"
421
-
422
- # Calculate standard deviation for agreement
423
- std_dev = np.std(all_predictions)
424
- mean_abs = np.mean(np.abs(all_predictions))
425
-
426
- if std_dev < 0.2 and mean_abs > 0.3:
427
- return "HIGH"
428
- elif std_dev < 0.4 and mean_abs > 0.2:
429
- return "MEDIUM"
430
- else:
431
- return "LOW"
432
-
433
- def _extract_key_themes(self, texts: List[str], results: Dict) -> List[Dict]:
434
- """Extract key themes with sentiment scores"""
435
- themes = []
436
-
437
- # Simple theme extraction based on high-impact content
438
- for i, text in enumerate(texts[:10]): # Limit for performance
439
- # Calculate average sentiment for this text
440
- avg_sentiment = 0
441
- model_count = 0
442
-
443
- for model_results in results.values():
444
- if i < len(model_results['predictions']):
445
- avg_sentiment += model_results['predictions'][i]
446
- model_count += 1
447
-
448
- if model_count > 0:
449
- avg_sentiment /= model_count
450
-
451
- # Only include significant sentiments
452
- if abs(avg_sentiment) > 0.3:
453
- themes.append({
454
- 'headline': text[:100],
455
- 'sentiment': round(avg_sentiment, 3),
456
- 'impact': 'HIGH' if abs(avg_sentiment) > 0.6 else 'MEDIUM'
457
- })
458
-
459
- return sorted(themes, key=lambda x: abs(x['sentiment']), reverse=True)[:5]
460
-
461
- def _calculate_news_impact(self, text: str) -> float:
462
- """Calculate news impact multiplier"""
463
- text_lower = text.lower()
464
-
465
- # High impact keywords
466
- high_impact = ['breaking', 'urgent', 'alert', 'crash', 'surge', 'halted']
467
- medium_impact = ['announces', 'reports', 'updates', 'guidance']
468
-
469
- multiplier = 1.0
470
-
471
- if any(keyword in text_lower for keyword in high_impact):
472
- multiplier = 2.0
473
- elif any(keyword in text_lower for keyword in medium_impact):
474
- multiplier = 1.5
475
-
476
- return multiplier
477
-
478
- def _default_sentiment(self) -> Dict:
479
- """Return default sentiment values"""
480
- return {
481
- 'financial_sentiment': 0.0,
482
- 'social_sentiment': 0.0,
483
- 'economic_impact': 0.0,
484
- 'composite_score': 0.0,
485
- 'confidence': 'LOW',
486
- 'key_themes': [],
487
- 'model_count': 0
488
- }
489
-
490
- # Momentum-specific analysis functions
491
class MomentumSentimentSignals:
    """Generate momentum trading signals from sentiment"""

    @staticmethod
    def generate_momentum_signals(sentiment_data: Dict, timeframe: str = '5m') -> Dict:
        """Map a sentiment report onto a bullish/bearish/neutral signal.

        Args:
            sentiment_data: output of analyze_comprehensive_sentiment.
            timeframe: bar size ('1m', '5m', '15m'); unknown values fall
                back to the 5m thresholds.
        """
        score = sentiment_data.get('composite_score', 0)
        confidence = sentiment_data.get('confidence', 'LOW')
        impact = sentiment_data.get('economic_impact', 0)

        # Shorter timeframes trigger on weaker readings.
        per_timeframe = {
            '1m': {'strong': 0.3, 'weak': 0.15},
            '5m': {'strong': 0.4, 'weak': 0.2},
            '15m': {'strong': 0.5, 'weak': 0.25},
        }
        limits = per_timeframe.get(timeframe, per_timeframe['5m'])
        trusted = confidence in ['HIGH', 'MEDIUM']

        if score > limits['strong'] and trusted:
            signal, conviction = 'STRONG_BULLISH', (0.8 if confidence == 'HIGH' else 0.6)
        elif score > limits['weak']:
            signal, conviction = 'WEAK_BULLISH', 0.5
        elif score < -limits['strong'] and trusted:
            signal, conviction = 'STRONG_BEARISH', (0.8 if confidence == 'HIGH' else 0.6)
        elif score < -limits['weak']:
            signal, conviction = 'WEAK_BEARISH', 0.5
        else:
            signal, conviction = 'NEUTRAL', 0.3

        # Macro catalysts boost conviction (capped below).
        if impact > 3:
            conviction *= 1.2

        return {
            'signal': signal,
            'conviction': min(conviction, 1.0),
            'timeframe': timeframe,
            'composite_score': score,
            'economic_multiplier': impact,
        }
539
-
540
# Initialize global analyzer instance
sentiment_analyzer = None

def get_sentiment_analyzer():
    """Return the process-wide analyzer, building it lazily on first use.

    Bug fix: the instance is published to the module global only after
    initialize_models() succeeds, so a failed initialization is retried on
    the next call instead of caching a half-initialized analyzer.

    NOTE(review): not thread-safe; concurrent first calls may load the
    models twice — confirm single-threaded startup.
    """
    global sentiment_analyzer
    if sentiment_analyzer is None:
        instance = EnhancedFinancialSentimentAnalyzer()
        instance.initialize_models()
        sentiment_analyzer = instance
    return sentiment_analyzer
550
-
551
def analyze_momentum_sentiment(news_df: pd.DataFrame, social_df: pd.DataFrame,
                               symbol: str, timeframe: str = '5m') -> Dict:
    """One-call entry point: ensemble sentiment plus momentum signals.

    Returns the comprehensive sentiment report augmented with a
    'momentum_signals' key for the requested timeframe.
    """
    report = get_sentiment_analyzer().analyze_comprehensive_sentiment(
        news_df, social_df, symbol
    )
    signals = MomentumSentimentSignals.generate_momentum_signals(report, timeframe)
    return {**report, 'momentum_signals': signals}
569
-
570
# For backwards compatibility with existing code
class MultiModalSentimentAnalyzer(EnhancedFinancialSentimentAnalyzer):
    """Deprecated alias kept so existing imports continue to work."""
    pass
 
1
+ # trade_analysis/enhanced_sentiment.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import (
6
+ AutoTokenizer, AutoModelForSequenceClassification,
7
+ AutoModelForCausalLM, BitsAndBytesConfig, pipeline
8
+ )
9
+ from typing import Dict, List, Optional, Tuple
10
+ import numpy as np
11
+ import pandas as pd
12
+ import asyncio
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ import json
15
+ import os
16
+ from datetime import datetime, timedelta
17
+ import warnings
18
+ warnings.filterwarnings("ignore")
19
+
20
+ class EnhancedFinancialSentimentAnalyzer:
21
+ """
22
+ SOTA Financial Sentiment Analysis using 2025 models
23
+ Optimized for H100/H200 GPUs and momentum trading
24
+ """
25
+
26
+ def __init__(self, device: str = "auto"):
27
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
28
+ self.models = {}
29
+ self.tokenizers = {}
30
+ self.pipelines = {}
31
+
32
+ # Enhanced model configuration - WORKING MODELS ONLY
33
+ self.model_configs = {
34
+ # Tier 1: SOTA Financial Models (2025)
35
+ 'finbert_prosus': {
36
+ 'model_id': 'ProsusAI/finbert',
37
+ 'weight': 0.25,
38
+ 'type': 'classification',
39
+ 'specialization': 'general_financial'
40
+ # },
41
+ # 'finbert_tone': {
42
+ # 'model_id': 'yiyanghkust/finbert-tone',
43
+ # 'weight': 0.25,
44
+ # 'type': 'classification',
45
+ # 'specialization': 'tone_analysis'
46
+ # },
47
+ # 'roberta_financial': {
48
+ # 'model_id': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
49
+ # 'weight': 0.20,
50
+ # 'type': 'classification',
51
+ # 'specialization': 'social_sentiment'
52
+ # },
53
+ # 'distilroberta_financial': {
54
+ # 'model_id': 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis',
55
+ # 'weight': 0.20,
56
+ # 'type': 'classification',
57
+ # 'specialization': 'news_sentiment'
58
+ # },
59
+
60
+ # # Tier 2: Specialized Models
61
+ # 'fintwit_bert': {
62
+ # 'model_id': 'StephanAkkerman/FinTwitBERT-sentiment',
63
+ # 'weight': 0.10,
64
+ # 'type': 'classification',
65
+ # 'specialization': 'social_trading'
66
+ # }
67
+ }
68
+
69
+ # Renormalize weights
70
+ total_weight = sum(config['weight'] for config in self.model_configs.values())
71
+ for config in self.model_configs.values():
72
+ config['weight'] /= total_weight
73
+
74
+ def initialize_models(self):
75
+ """Load all sentiment models"""
76
+ print("Loading Enhanced Financial Sentiment Models...")
77
+
78
+ for model_key, config in self.model_configs.items():
79
+ try:
80
+ print(f"Loading {model_key}...")
81
+
82
+ if config['type'] == 'classification':
83
+ # Load classification models
84
+ self.tokenizers[model_key] = AutoTokenizer.from_pretrained(
85
+ config['model_id'],
86
+ trust_remote_code=True
87
+ )
88
+ self.models[model_key] = AutoModelForSequenceClassification.from_pretrained(
89
+ config['model_id'],
90
+ trust_remote_code=True
91
+ ).to(self.device)
92
+
93
+ elif config['type'] == 'causal':
94
+ # Skip causal models for now since they're having issues
95
+ print(f"Skipping causal model {model_key} - focusing on classification models")
96
+ config['weight'] = 0
97
+ continue
98
+
99
+ print(f"✅ {model_key} loaded successfully")
100
+
101
+ except Exception as e:
102
+ print(f"❌ Failed to load {model_key}: {e}")
103
+ config['weight'] = 0
104
+
105
+ # Create sentiment pipeline for fast inference
106
+ self._create_pipelines()
107
+ print(f"✅ Loaded {len(self.models)} sentiment models")
108
+
109
+ def _create_pipelines(self):
110
+ """Create HuggingFace pipelines for efficient inference"""
111
+ for model_key, config in self.model_configs.items():
112
+ if config['weight'] > 0 and model_key in self.models:
113
+ if config['type'] == 'classification':
114
+ try:
115
+ self.pipelines[model_key] = pipeline(
116
+ "sentiment-analysis",
117
+ model=self.models[model_key],
118
+ tokenizer=self.tokenizers[model_key],
119
+ device=0 if torch.cuda.is_available() else -1,
120
+ return_all_scores=True
121
+ )
122
+ except Exception as e:
123
+ print(f"Failed to create pipeline for {model_key}: {e}")
124
+
125
+ def analyze_comprehensive_sentiment(self, news_df: pd.DataFrame, social_df: pd.DataFrame, symbol: str) -> Dict:
126
+ """
127
+ Comprehensive sentiment analysis for momentum trading
128
+ """
129
+ if news_df.empty and social_df.empty:
130
+ return self._default_sentiment()
131
+
132
+ # Prepare text data
133
+ texts = []
134
+ metadata = []
135
+
136
+ # Add news headlines
137
+ if not news_df.empty:
138
+ for _, row in news_df.iterrows():
139
+ text = row.get('headline', '') or row.get('title', '')
140
+ if text:
141
+ texts.append(str(text))
142
+ metadata.append({
143
+ 'source': 'news',
144
+ 'timestamp': row.get('datetime', datetime.now()),
145
+ 'impact': self._calculate_news_impact(str(text))
146
+ })
147
+
148
+ # Add social media content
149
+ if not social_df.empty:
150
+ for _, row in social_df.iterrows():
151
+ text = row.get('title', '') or row.get('content', '')
152
+ if text:
153
+ texts.append(str(text))
154
+ metadata.append({
155
+ 'source': 'social',
156
+ 'timestamp': row.get('created_utc', datetime.now()),
157
+ 'score': row.get('score', 0)
158
+ })
159
+
160
+ if not texts:
161
+ return self._default_sentiment()
162
+
163
+ # Run ensemble sentiment analysis
164
+ sentiment_results = self._run_ensemble_sentiment(texts)
165
+
166
+ # Calculate weighted sentiment scores
167
+ financial_sentiment = self._calculate_financial_sentiment(sentiment_results, metadata)
168
+ social_sentiment = self._calculate_social_sentiment(sentiment_results, metadata)
169
+
170
+ # Economic impact analysis
171
+ economic_impact = self._analyze_economic_impact(texts)
172
+
173
+ # Create momentum-focused composite score
174
+ composite_score = self._calculate_momentum_composite(
175
+ financial_sentiment, social_sentiment, economic_impact
176
+ )
177
+
178
+ # Generate key themes for transparency
179
+ key_themes = self._extract_key_themes(texts, sentiment_results)
180
+
181
+ return {
182
+ 'financial_sentiment': financial_sentiment,
183
+ 'social_sentiment': social_sentiment,
184
+ 'economic_impact': economic_impact,
185
+ 'composite_score': composite_score,
186
+ 'confidence': self._calculate_confidence(sentiment_results),
187
+ 'key_themes': key_themes,
188
+ 'model_count': len([k for k, v in self.model_configs.items() if v['weight'] > 0])
189
+ }
190
+
191
+ def _run_ensemble_sentiment(self, texts: List[str]) -> Dict:
192
+ """Run all available models on the text data"""
193
+ results = {}
194
+
195
+ for model_key, config in self.model_configs.items():
196
+ if config['weight'] == 0 or model_key not in self.models:
197
+ continue
198
+
199
+ try:
200
+ if config['type'] == 'classification':
201
+ # Use pipeline for fast inference
202
+ if model_key in self.pipelines:
203
+ predictions = []
204
+ for text in texts:
205
+ result = self.pipelines[model_key](text[:512])
206
+ # Convert to standardized score
207
+ if isinstance(result, list) and len(result) > 0:
208
+ if isinstance(result[0], dict):
209
+ score = self._standardize_classification_score(result)
210
+ else:
211
+ score = self._standardize_classification_score(result[0])
212
+ else:
213
+ score = 0.0
214
+ predictions.append(score)
215
+ else:
216
+ predictions = self._run_classification_batch(texts, model_key)
217
+
218
+ elif config['type'] == 'causal':
219
+ # Skip causal for now
220
+ continue
221
+
222
+ results[model_key] = {
223
+ 'predictions': predictions,
224
+ 'weight': config['weight'],
225
+ 'specialization': config['specialization']
226
+ }
227
+
228
+ except Exception as e:
229
+ print(f"Error running {model_key}: {e}")
230
+ continue
231
+
232
+ return results
233
+
234
+ def _run_classification_batch(self, texts: List[str], model_key: str) -> List[float]:
235
+ """Run classification model in batches"""
236
+ model = self.models[model_key]
237
+ tokenizer = self.tokenizers[model_key]
238
+
239
+ predictions = []
240
+ batch_size = 8 # Reduced for stability
241
+
242
+ for i in range(0, len(texts), batch_size):
243
+ batch_texts = texts[i:i + batch_size]
244
+
245
+ try:
246
+ inputs = tokenizer(
247
+ batch_texts,
248
+ padding=True,
249
+ truncation=True,
250
+ max_length=512,
251
+ return_tensors="pt"
252
+ ).to(self.device)
253
+
254
+ with torch.no_grad():
255
+ outputs = model(**inputs)
256
+ probs = torch.softmax(outputs.logits, dim=-1)
257
+
258
+ for prob in probs:
259
+ if prob.shape[0] == 3: # [negative, neutral, positive]
260
+ score = prob[2].item() - prob[0].item()
261
+ else: # [negative, positive]
262
+ score = prob[1].item() - prob[0].item()
263
+ predictions.append(score)
264
+ except Exception as e:
265
+ print(f"Batch processing error: {e}")
266
+ # Add neutral scores for failed batch
267
+ predictions.extend([0.0] * len(batch_texts))
268
+
269
+ return predictions
270
+
271
+ def _standardize_classification_score(self, result) -> float:
272
+ """Convert pipeline output to standardized score"""
273
+ if not result:
274
+ return 0.0
275
+
276
+ try:
277
+ # Handle nested list structure
278
+ if isinstance(result, list) and len(result) > 0:
279
+ if isinstance(result[0], list):
280
+ result = result[0]
281
+
282
+ # Convert to dict if not already
283
+ if isinstance(result, list):
284
+ scores = {}
285
+ for item in result:
286
+ if isinstance(item, dict) and 'label' in item:
287
+ scores[item['label'].upper()] = item['score']
288
+ else:
289
+ return 0.0
290
+
291
+ positive_labels = ['POSITIVE', 'POS', 'BULLISH', 'LABEL_2']
292
+ negative_labels = ['NEGATIVE', 'NEG', 'BEARISH', 'LABEL_0']
293
+
294
+ positive_score = sum(scores.get(label, 0) for label in positive_labels)
295
+ negative_score = sum(scores.get(label, 0) for label in negative_labels)
296
+
297
+ return positive_score - negative_score
298
+ except Exception as e:
299
+ print(f"Score standardization error: {e}")
300
+ return 0.0
301
+
302
+ def _calculate_financial_sentiment(self, results: Dict, metadata: List[Dict]) -> float:
303
+ """Calculate weighted financial sentiment score"""
304
+ if not results:
305
+ return 0.0
306
+
307
+ weighted_scores = []
308
+ total_weight = 0
309
+
310
+ for model_key, model_results in results.items():
311
+ predictions = model_results['predictions']
312
+ weight = model_results['weight']
313
+ specialization = model_results['specialization']
314
+
315
+ # Apply specialization bonus
316
+ if specialization in ['general_financial', 'earnings', 'news_sentiment']:
317
+ weight *= 1.2
318
+
319
+ # Weight by news impact
320
+ for i, pred in enumerate(predictions[:len(metadata)]):
321
+ meta = metadata[i] if i < len(metadata) else {'source': 'unknown', 'impact': 1.0}
322
+ if meta['source'] == 'news':
323
+ impact_weight = meta.get('impact', 1.0)
324
+ weighted_scores.append(pred * weight * impact_weight)
325
+ total_weight += weight * impact_weight
326
+ else:
327
+ weighted_scores.append(pred * weight)
328
+ total_weight += weight
329
+
330
+ return sum(weighted_scores) / max(total_weight, 1)
331
+
332
+ def _calculate_social_sentiment(self, results: Dict, metadata: List[Dict]) -> float:
333
+ """Calculate social media sentiment score"""
334
+ if not results:
335
+ return 0.0
336
+
337
+ social_scores = []
338
+
339
+ for model_key, model_results in results.items():
340
+ predictions = model_results['predictions']
341
+ specialization = model_results['specialization']
342
+
343
+ # Prioritize social-specific models
344
+ weight = 1.5 if specialization == 'social_sentiment' else 1.0
345
+
346
+ for i, pred in enumerate(predictions[:len(metadata)]):
347
+ meta = metadata[i] if i < len(metadata) else {'source': 'unknown', 'score': 0}
348
+ if meta['source'] == 'social':
349
+ # Weight by social score (upvotes, likes, etc.)
350
+ social_weight = min(max(meta.get('score', 0) / 10, 0.5), 2.0)
351
+ social_scores.append(pred * weight * social_weight)
352
+
353
+ return np.mean(social_scores) if social_scores else 0.0
354
+
355
+ def _analyze_economic_impact(self, texts: List[str]) -> float:
356
+ """Analyze economic impact using keyword analysis"""
357
+ impact_keywords = {
358
+ 'high_impact': ['fed', 'federal reserve', 'inflation', 'gdp', 'unemployment', 'interest rate'],
359
+ 'medium_impact': ['earnings', 'revenue', 'profit', 'guidance', 'outlook'],
360
+ 'market_structure': ['merger', 'acquisition', 'ipo', 'split', 'dividend']
361
+ }
362
+
363
+ total_impact = 0
364
+ impact_count = 0
365
+
366
+ for text in texts:
367
+ text_lower = text.lower()
368
+
369
+ # High impact events
370
+ high_matches = sum(1 for keyword in impact_keywords['high_impact']
371
+ if keyword in text_lower)
372
+ if high_matches > 0:
373
+ total_impact += high_matches * 3
374
+ impact_count += 1
375
+
376
+ # Medium impact events
377
+ medium_matches = sum(1 for keyword in impact_keywords['medium_impact']
378
+ if keyword in text_lower)
379
+ if medium_matches > 0:
380
+ total_impact += medium_matches * 2
381
+ impact_count += 1
382
+
383
+ # Market structure events
384
+ structure_matches = sum(1 for keyword in impact_keywords['market_structure']
385
+ if keyword in text_lower)
386
+ if structure_matches > 0:
387
+ total_impact += structure_matches * 1.5
388
+ impact_count += 1
389
+
390
+ return total_impact / max(impact_count, 1)
391
+
392
+ def _calculate_momentum_composite(self, financial_sent: float, social_sent: float,
393
+ economic_impact: float) -> float:
394
+ """Calculate composite score optimized for momentum trading"""
395
+ # Momentum trading weights - prioritize speed and strength
396
+ financial_weight = 0.5 # Primary signal
397
+ social_weight = 0.2 # Secondary confirmation
398
+ economic_weight = 0.3 # Impact multiplier
399
+
400
+ composite = (financial_sent * financial_weight +
401
+ social_sent * social_weight +
402
+ economic_impact * economic_weight * 0.1) # Scale economic impact
403
+
404
+ # Apply momentum amplification for strong signals
405
+ if abs(composite) > 0.5:
406
+ composite *= 1.2
407
+
408
+ return np.clip(composite, -1.0, 1.0)
409
+
410
+ def _calculate_confidence(self, results: Dict) -> str:
411
+ """Calculate confidence level based on model agreement"""
412
+ if not results:
413
+ return "LOW"
414
+
415
+ all_predictions = []
416
+ for model_results in results.values():
417
+ all_predictions.extend(model_results['predictions'])
418
+
419
+ if not all_predictions:
420
+ return "LOW"
421
+
422
+ # Calculate standard deviation for agreement
423
+ std_dev = np.std(all_predictions)
424
+ mean_abs = np.mean(np.abs(all_predictions))
425
+
426
+ if std_dev < 0.2 and mean_abs > 0.3:
427
+ return "HIGH"
428
+ elif std_dev < 0.4 and mean_abs > 0.2:
429
+ return "MEDIUM"
430
+ else:
431
+ return "LOW"
432
+
433
+ def _extract_key_themes(self, texts: List[str], results: Dict) -> List[Dict]:
434
+ """Extract key themes with sentiment scores"""
435
+ themes = []
436
+
437
+ # Simple theme extraction based on high-impact content
438
+ for i, text in enumerate(texts[:10]): # Limit for performance
439
+ # Calculate average sentiment for this text
440
+ avg_sentiment = 0
441
+ model_count = 0
442
+
443
+ for model_results in results.values():
444
+ if i < len(model_results['predictions']):
445
+ avg_sentiment += model_results['predictions'][i]
446
+ model_count += 1
447
+
448
+ if model_count > 0:
449
+ avg_sentiment /= model_count
450
+
451
+ # Only include significant sentiments
452
+ if abs(avg_sentiment) > 0.3:
453
+ themes.append({
454
+ 'headline': text[:100],
455
+ 'sentiment': round(avg_sentiment, 3),
456
+ 'impact': 'HIGH' if abs(avg_sentiment) > 0.6 else 'MEDIUM'
457
+ })
458
+
459
+ return sorted(themes, key=lambda x: abs(x['sentiment']), reverse=True)[:5]
460
+
461
+ def _calculate_news_impact(self, text: str) -> float:
462
+ """Calculate news impact multiplier"""
463
+ text_lower = text.lower()
464
+
465
+ # High impact keywords
466
+ high_impact = ['breaking', 'urgent', 'alert', 'crash', 'surge', 'halted']
467
+ medium_impact = ['announces', 'reports', 'updates', 'guidance']
468
+
469
+ multiplier = 1.0
470
+
471
+ if any(keyword in text_lower for keyword in high_impact):
472
+ multiplier = 2.0
473
+ elif any(keyword in text_lower for keyword in medium_impact):
474
+ multiplier = 1.5
475
+
476
+ return multiplier
477
+
478
+ def _default_sentiment(self) -> Dict:
479
+ """Return default sentiment values"""
480
+ return {
481
+ 'financial_sentiment': 0.0,
482
+ 'social_sentiment': 0.0,
483
+ 'economic_impact': 0.0,
484
+ 'composite_score': 0.0,
485
+ 'confidence': 'LOW',
486
+ 'key_themes': [],
487
+ 'model_count': 0
488
+ }
489
+
490
# Momentum-specific analysis functions
class MomentumSentimentSignals:
    """Generate momentum trading signals from sentiment"""

    @staticmethod
    def generate_momentum_signals(sentiment_data: Dict, timeframe: str = '5m') -> Dict:
        """Turn a sentiment payload into a scalping/day-trading signal.

        The composite score is compared against timeframe-specific
        cutoffs; confidence gates the "strong" labels, and a large
        economic-impact score boosts conviction (capped at 1.0).
        """
        score = sentiment_data.get('composite_score', 0)
        confidence = sentiment_data.get('confidence', 'LOW')
        econ = sentiment_data.get('economic_impact', 0)

        # Per-timeframe cutoffs: shorter frames react to weaker readings.
        cutoffs = {
            '1m': {'strong': 0.3, 'weak': 0.15},
            '5m': {'strong': 0.4, 'weak': 0.2},
            '15m': {'strong': 0.5, 'weak': 0.25},
        }
        levels = cutoffs.get(timeframe, cutoffs['5m'])
        trusted = confidence in ('HIGH', 'MEDIUM')

        if score > levels['strong'] and trusted:
            signal = 'STRONG_BULLISH'
            conviction = 0.8 if confidence == 'HIGH' else 0.6
        elif score > levels['weak']:
            signal = 'WEAK_BULLISH'
            conviction = 0.5
        elif score < -levels['strong'] and trusted:
            signal = 'STRONG_BEARISH'
            conviction = 0.8 if confidence == 'HIGH' else 0.6
        elif score < -levels['weak']:
            signal = 'WEAK_BEARISH'
            conviction = 0.5
        else:
            signal = 'NEUTRAL'
            conviction = 0.3

        # Macro-heavy news boosts conviction; output is capped at 1.0.
        if econ > 3:
            conviction *= 1.2

        return {
            'signal': signal,
            'conviction': min(conviction, 1.0),
            'timeframe': timeframe,
            'composite_score': score,
            'economic_multiplier': econ,
        }
539
+
540
# Initialize global analyzer instance (lazy singleton: stays None until the
# first get_sentiment_analyzer() call pays the model-loading cost)
sentiment_analyzer: Optional[EnhancedFinancialSentimentAnalyzer] = None
542
+
543
def get_sentiment_analyzer():
    """Get or create the module-level sentiment analyzer instance."""
    global sentiment_analyzer
    if sentiment_analyzer is not None:
        return sentiment_analyzer
    # First call pays the model-loading cost; later calls reuse the instance.
    sentiment_analyzer = EnhancedFinancialSentimentAnalyzer()
    sentiment_analyzer.initialize_models()
    return sentiment_analyzer
550
+
551
def analyze_momentum_sentiment(news_df: pd.DataFrame, social_df: pd.DataFrame,
                               symbol: str, timeframe: str = '5m') -> Dict:
    """Run the full momentum sentiment pipeline for one symbol.

    Performs the multi-model sentiment pass over news + social feeds,
    then attaches the timeframe-specific trading signal under the
    ``momentum_signals`` key.
    """
    analyzer = get_sentiment_analyzer()
    sentiment = analyzer.analyze_comprehensive_sentiment(news_df, social_df, symbol)

    signals = MomentumSentimentSignals.generate_momentum_signals(sentiment, timeframe)

    combined = dict(sentiment)
    combined['momentum_signals'] = signals
    return combined
569
+
570
# For backwards compatibility with existing code
class MultiModalSentimentAnalyzer(EnhancedFinancialSentimentAnalyzer):
    """Backwards compatibility class.

    Pure alias for EnhancedFinancialSentimentAnalyzer kept so older
    imports of the previous class name keep working; adds no behavior.
    """
    pass