| | import torch |
| | import numpy as np |
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| | from scipy.stats import zscore |
| |
|
class SentimentAnalyzer:
    """Score the sentiment of financial news items with pretrained classifiers.

    Two Hugging Face sequence-classification models and their tokenizers are
    loaded at construction time; ``analyze`` uses the ``'financial_sentiment'``
    pair only.
    """

    # Returned when there is nothing to score. Values are kept as-is for
    # backward compatibility (note they sum to 0.99, not 1.0).
    _DEFAULT_SENTIMENT = {'negative': 0.33, 'neutral': 0.33, 'positive': 0.33}

    # An article is dropped when any class probability lies at least this many
    # standard deviations away from the mean over all articles.
    _OUTLIER_Z_THRESHOLD = 2.0

    def __init__(self):
        """Load both pretrained models and their tokenizers."""
        self.models = {
            'finbert': AutoModelForSequenceClassification.from_pretrained(
                "yiyanghkust/finbert-tone"
            ),
            'financial_sentiment': AutoModelForSequenceClassification.from_pretrained(
                "ahmedrachid/FinancialBERT-Sentiment-Analysis"
            ),
        }
        self.tokenizers = {
            'finbert': AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone"),
            'financial_sentiment': AutoTokenizer.from_pretrained(
                "ahmedrachid/FinancialBERT-Sentiment-Analysis"
            ),
        }
        # Maximum number of tokens (special tokens included) fed to a model.
        self.max_length = 512

    def chunk_text(self, text, tokenizer):
        """Split ``text`` into lists of token ids that fit the model window.

        The text is encoded WITHOUT special tokens, and each chunk leaves room
        for the special tokens the tokenizer adds when the chunk is encoded
        again for the model. Cutting full ``max_length`` windows would make
        that second encoding truncate the end of every full chunk.

        Args:
            text: The string to tokenize.
            tokenizer: A Hugging Face tokenizer.

        Returns:
            A list of token-id lists; empty when ``text`` yields no tokens.
        """
        reserved = tokenizer.num_special_tokens_to_add(pair=False)
        window = max(1, self.max_length - reserved)
        tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
        return [tokens[i:i + window] for i in range(0, len(tokens), window)]

    def preprocess_text(self, item):
        """Join the ``title`` and ``content`` of a news item into one string.

        Missing or ``None`` fields are ignored, so they never show up as the
        literal text ``"None"``.

        Args:
            item: A dict-like news item.

        Returns:
            The combined, stripped text, or ``None`` when both fields are empty.
        """
        parts = []
        for key in ('title', 'content'):
            value = item.get(key)
            if value is None:
                continue
            value = str(value).strip()
            if value:
                parts.append(value)
        return ' '.join(parts) or None

    def _score_chunks(self, text, tokenizer, model):
        """Return one class-probability vector per chunk of ``text``."""
        scores = []
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for chunk in self.chunk_text(text, tokenizer):
                chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
                # truncation stays on as a guard in case the decode/encode
                # round trip yields a few more tokens than the chunk held.
                inputs = tokenizer(
                    chunk_str,
                    return_tensors="pt",
                    truncation=True,
                    max_length=self.max_length,
                )
                logits = model(**inputs).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)
                scores.append(probabilities.numpy()[0])
        return scores

    @staticmethod
    def _aggregate_scores(scores, threshold=_OUTLIER_Z_THRESHOLD):
        """Average per-article score vectors after dropping outlier articles.

        Z-scores are computed per class ACROSS articles. An article is kept
        only when all of its z-scores are below ``threshold`` in absolute
        value. With population statistics the largest possible |z| among n
        samples is (n - 1) / sqrt(n), so nothing can be dropped until there
        are at least six articles at the default threshold of 2.

        Args:
            scores: Non-empty sequence of equal-length probability vectors.
            threshold: Absolute z-score at or above which an article is dropped.

        Returns:
            A 1-D numpy array holding the mean of the kept vectors, or of all
            vectors when every one of them would have been dropped.
        """
        matrix = np.asarray(scores, dtype=float)
        centred = matrix - matrix.mean(axis=0)
        spread = matrix.std(axis=0)
        # Columns without spread get z = 0 instead of NaN from a 0/0 division.
        z = np.zeros_like(matrix)
        np.divide(centred, spread, out=z, where=spread > 1e-12)
        keep = np.abs(z).max(axis=1) < threshold
        kept = matrix[keep] if keep.any() else matrix
        return kept.mean(axis=0)

    def analyze(self, news):
        """Compute the average sentiment over a collection of news items.

        Items that are not dicts or that have no text are skipped. Long texts
        are scored chunk by chunk and the chunk probabilities are averaged per
        article before the articles are combined.

        Args:
            news: Iterable of dict news items with optional ``title`` and
                ``content`` entries.

        Returns:
            A dict with float values under ``'negative'``, ``'neutral'`` and
            ``'positive'``. A fixed 0.33/0.33/0.33 dict is returned when no
            item could be scored.
        """
        if not news:
            return dict(self._DEFAULT_SENTIMENT)

        tokenizer = self.tokenizers['financial_sentiment']
        model = self.models['financial_sentiment']

        sentiment_scores = []
        for item in news:
            if not isinstance(item, dict):
                continue

            text = self.preprocess_text(item)
            if not text:
                continue

            chunk_scores = self._score_chunks(text, tokenizer, model)
            if chunk_scores:
                sentiment_scores.append(np.mean(chunk_scores, axis=0))

        if not sentiment_scores:
            return dict(self._DEFAULT_SENTIMENT)

        avg_sentiment = self._aggregate_scores(sentiment_scores)

        # NOTE(review): assumes the model's output order is
        # (negative, neutral, positive) - confirm against model.config.id2label.
        return {
            'negative': float(avg_sentiment[0]),
            'neutral': float(avg_sentiment[1]),
            'positive': float(avg_sentiment[2]),
        }
| |
|