import torch from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline import logging import re from typing import Dict, Tuple, Optional import jieba import emoji import os logger = logging.getLogger(__name__) class SentimentAnalyzer: """中文新聞情緒分析器 - 修正版""" def __init__(self, model_name: str = "uer/roberta-base-finetuned-jd-binary-chinese"): self.model_name = model_name self.tokenizer = None self.model = None self.classifier = None self.device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Device set to use {self.device}") # 初始化模型 self._load_model() # 情緒關鍵字典 self.positive_keywords = { '上漲', '漲', '漲幅', '上升', '增長', '成長', '利好', '利多', '買進', '看好', '樂觀', '獲利', '盈利', '突破', '新高', '強勢', '回升', '反彈', '看漲', '推薦', '買入', '增持', '超買', '牛市', '多頭', '正面', '積極', '飆漲', '大漲', '強勢', '創新高', '獲利', '成功', '贏家', '提升', '改善' } self.negative_keywords = { '下跌', '跌', '跌幅', '下滑', '下降', '減少', '衰退', '利空', '賣出', '看壞', '悲觀', '虧損', '損失', '破底', '新低', '弱勢', '下探', '重挫', '看跌', '賣出', '減持', '超賣', '熊市', '空頭', '負面', '消極', '警告', '暴跌', '大跌', '崩盤', '危機', '風險', '下修', '衰退' } def _load_model(self): """載入預訓練模型 - 修正版""" try: logger.info(f"開始載入情緒分析模型: {self.model_name}") # 載入 tokenizer logger.info("載入 tokenizer...") self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, trust_remote_code=True ) # 載入模型 logger.info("載入模型...") self.model = AutoModelForSequenceClassification.from_pretrained( self.model_name, trust_remote_code=True ) # 移動到適當的設備 if self.device == "cuda": self.model = self.model.cuda() # 創建分類器管道 - 修正配置 logger.info("創建分類器管道...") self.classifier = pipeline( "text-classification", model=self.model, tokenizer=self.tokenizer, device=0 if self.device == "cuda" else -1, return_all_scores=False # 只返回最高分的結果 ) logger.info("✅ 情緒分析模型載入成功") except Exception as e: logger.error(f"❌ 載入模型時發生錯誤: {e}") logger.info("將使用關鍵字分析作為備用方案") self.classifier = None def _preprocess_text(self, text: str) -> str: """文本預處理""" try: if not text: return "" # 移除emoji text = emoji.demojize(text, language='zh') text = re.sub(r':[a-zA-Z_]+:', '', text) # 移除特殊字符 text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbf\w\s.,!?()(),。!?]', '', text) # 移除多餘空格 text = re.sub(r'\s+', ' ', text).strip() # 截斷長度 (BERT模型限制) if len(text) > 500: text = text[:500] return text except Exception as e: logger.error(f"文本預處理錯誤: {e}") return text def _keyword_sentiment(self, text: str) -> Tuple[str, float]: """基於關鍵字的情緒分析""" if not text: return "neutral", 0.5 positive_count = sum(1 for keyword in self.positive_keywords if keyword in text) negative_count = sum(1 for keyword in self.negative_keywords if keyword in text) total_keywords = positive_count + negative_count if total_keywords == 0: return "neutral", 0.5 positive_ratio = positive_count / total_keywords if positive_ratio > 0.6: return "positive", 0.7 + (positive_ratio - 0.6) * 0.75 elif positive_ratio < 0.4: return "negative", 0.3 - (0.4 - positive_ratio) * 0.75 else: return "neutral", 0.5 def analyze_sentiment(self, text: str, title: str = "") -> Dict[str, any]: """分析文本情緒 - 修正版""" try: # 合併標題和內容 full_text = f"{title} {text}" if title else text processed_text = self._preprocess_text(full_text) if not processed_text: return { "sentiment": "neutral", "confidence": 0.5, "method": "default" } # 使用模型分析 if self.classifier: try: # 修正模型調用方式 result = self.classifier(processed_text) # 處理模型結果 - 修正數據結構問題 if result: # result 是單個字典,不是列表 if isinstance(result, list) and len(result) > 0: best_result = result[0] else: best_result = result # 標籤映射 label_mapping = { 'LABEL_0': 'negative', 'LABEL_1': 'positive', 'negative': 'negative', 'positive': 'positive' } sentiment = label_mapping.get(best_result.get('label', ''), 'neutral') confidence = best_result.get('score', 0.5) # 如果信心度較低,使用關鍵字方法 if confidence < 0.7: keyword_sentiment, keyword_confidence = self._keyword_sentiment(processed_text) # 加權平均 if abs(confidence - 0.5) < abs(keyword_confidence - 0.5): sentiment = keyword_sentiment confidence = (confidence + keyword_confidence) / 2 method = "hybrid" else: method = "model" else: method = "model" return { "sentiment": sentiment, "confidence": confidence, "method": method } except Exception as e: logger.error(f"模型分析錯誤: {e}") logger.debug(f"錯誤詳情: {str(e)}") # 備用:關鍵字分析 sentiment, confidence = self._keyword_sentiment(processed_text) return { "sentiment": sentiment, "confidence": confidence, "method": "keyword" } except Exception as e: logger.error(f"情緒分析錯誤: {e}") return { "sentiment": "neutral", "confidence": 0.5, "method": "error" } def batch_analyze(self, texts: list, titles: list = None) -> list: """批量分析情緒""" results = [] titles = titles or [""] * len(texts) for i, text in enumerate(texts): title = titles[i] if i < len(titles) else "" result = self.analyze_sentiment(text, title) results.append(result) # 避免GPU記憶體問題 if i % 10 == 0 and torch.cuda.is_available(): torch.cuda.empty_cache() return results