""" 文字內容分析模組 """ import re import jieba from typing import Dict, List, Optional import logging from collections import Counter logger = logging.getLogger(__name__) class TextAnalyzer: """文字內容分析器""" def __init__(self): """初始化文字分析器""" # 初始化jieba分詞 jieba.initialize() # 情感詞典(簡化版) self.positive_words = { "好", "棒", "讚", "優秀", "完美", "喜歡", "愛", "開心", "快樂", "高興", "滿意", "驚喜", "感動", "溫暖", "美好", "精彩", "出色", "傑出", "優秀" } self.negative_words = { "壞", "差", "爛", "討厭", "恨", "生氣", "憤怒", "失望", "難過", "痛苦", "糟糕", "惡劣", "可惡", "討厭", "煩人", "無聊", "討厭", "噁心", "恐怖" } # 停用詞 self.stop_words = { "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一個", "上", "也", "很", "到", "說", "要", "去", "你", "會", "著", "沒有", "看", "好", "自己", "這", "那", "他", "她", "它", "們", "我們", "你們" } def analyze(self, text: str, analysis_type: str = "comprehensive") -> Dict: """ 分析文字內容 Args: text: 要分析的文字 analysis_type: 分析類型 Returns: 分析結果字典 """ try: results = { "original_text": text, "analysis_type": analysis_type, "word_count": len(text), "char_count": len(text.replace(" ", "")), "sentiment": self._analyze_sentiment(text), "keywords": self._extract_keywords(text), "topics": self._extract_topics(text), "summary": "" } # 根據分析類型添加特定分析 if analysis_type in ["comprehensive", "sentiment"]: results["sentiment_score"] = self._calculate_sentiment_score(text) if analysis_type in ["comprehensive", "content_classification"]: results["content_category"] = self._classify_content(text) results["language"] = self._detect_language(text) if analysis_type in ["comprehensive", "keyword_extraction"]: results["named_entities"] = self._extract_named_entities(text) # 生成總結 results["summary"] = self._generate_summary(results) logger.info(f"文字分析完成: {analysis_type}") return results except Exception as e: logger.error(f"文字分析失敗: {e}") return {"error": str(e)} def _analyze_sentiment(self, text: str) -> str: """分析情感傾向""" words = jieba.lcut(text) positive_count = sum(1 for word in words if word in self.positive_words) negative_count = sum(1 for word in words if word in self.negative_words) if positive_count > negative_count: return "正面" elif negative_count > positive_count: return "負面" else: return "中性" def _calculate_sentiment_score(self, text: str) -> float: """計算情感分數 (-1 到 1)""" words = jieba.lcut(text) positive_count = sum(1 for word in words if word in self.positive_words) negative_count = sum(1 for word in words if word in self.negative_words) total_words = len(words) if total_words == 0: return 0.0 score = (positive_count - negative_count) / total_words return max(-1.0, min(1.0, score)) def _extract_keywords(self, text: str, top_k: int = 10) -> List[str]: """提取關鍵詞""" words = jieba.lcut(text) # 過濾停用詞和短詞 filtered_words = [ word for word in words if len(word) > 1 and word not in self.stop_words ] # 計算詞頻 word_freq = Counter(filtered_words) # 返回最常見的詞 return [word for word, freq in word_freq.most_common(top_k)] def _extract_topics(self, text: str) -> List[str]: """提取主題(簡化版)""" # 這裡使用簡單的關鍵詞提取作為主題 keywords = self._extract_keywords(text, top_k=5) # 可以根據需要添加更複雜的主題建模 return keywords def _classify_content(self, text: str) -> str: """內容分類""" # 簡化的內容分類 if any(word in text for word in ["新聞", "報導", "消息", "事件"]): return "新聞" elif any(word in text for word in ["評論", "觀點", "看法", "認為"]): return "評論" elif any(word in text for word in ["問題", "求助", "請教", "怎麼辦"]): return "問答" elif any(word in text for word in ["分享", "推薦", "介紹", "推薦"]): return "分享" else: return "一般" def _detect_language(self, text: str) -> str: """檢測語言""" # 簡單的中文檢測 chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text)) total_chars = len(text.replace(" ", "")) if total_chars == 0: return "未知" chinese_ratio = chinese_chars / total_chars if chinese_ratio > 0.5: return "中文" else: return "其他" def _extract_named_entities(self, text: str) -> List[str]: """提取命名實體(簡化版)""" # 簡單的實體提取 entities = [] # 提取可能的姓名(2-4個中文字符) names = re.findall(r'[\u4e00-\u9fff]{2,4}', text) entities.extend(names) # 提取可能的組織名稱 org_patterns = [ r'[\u4e00-\u9fff]+公司', r'[\u4e00-\u9fff]+大學', r'[\u4e00-\u9fff]+政府', r'[\u4e00-\u9fff]+協會' ] for pattern in org_patterns: orgs = re.findall(pattern, text) entities.extend(orgs) return list(set(entities)) def _generate_summary(self, results: Dict) -> str: """生成分析總結""" summary_parts = [] summary_parts.append(f"文字長度: {results['char_count']} 字符") summary_parts.append(f"情感傾向: {results['sentiment']}") if 'sentiment_score' in results: score = results['sentiment_score'] summary_parts.append(f"情感分數: {score:.2f}") if results['keywords']: summary_parts.append(f"主要關鍵詞: {', '.join(results['keywords'][:5])}") if 'content_category' in results: summary_parts.append(f"內容類型: {results['content_category']}") return " | ".join(summary_parts)