Spaces:
Sleeping
Sleeping
| """ | |
| 文字內容分析模組 | |
| """ | |
| import re | |
| import jieba | |
| from typing import Dict, List, Optional | |
| import logging | |
| from collections import Counter | |
| logger = logging.getLogger(__name__) | |
class TextAnalyzer:
    """Text content analyzer.

    Lightweight, dictionary-based analysis of (mostly Chinese) text:
    sentiment polarity and score, keyword extraction, topic hints,
    content classification, language detection, and simple named-entity
    extraction. All methods are heuristic "simplified" implementations.
    """

    def __init__(self):
        """Initialize the analyzer: prime jieba and load word lists."""
        # Build jieba's dictionary eagerly so the first analyze() call
        # does not pay the one-off initialization cost.
        jieba.initialize()
        # Sentiment lexicons (simplified). Sets give O(1) membership tests.
        self.positive_words = {
            "好", "棒", "讚", "優秀", "完美", "喜歡", "愛", "開心", "快樂", "高興",
            "滿意", "驚喜", "感動", "溫暖", "美好", "精彩", "出色", "傑出",
        }
        self.negative_words = {
            "壞", "差", "爛", "討厭", "恨", "生氣", "憤怒", "失望", "難過", "痛苦",
            "糟糕", "惡劣", "可惡", "煩人", "無聊", "噁心", "恐怖",
        }
        # Stop words filtered out during keyword extraction.
        self.stop_words = {
            "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一",
            "一個", "上", "也", "很", "到", "說", "要", "去", "你", "會", "著", "沒有",
            "看", "好", "自己", "這", "那", "他", "她", "它", "們", "我們", "你們",
        }

    def analyze(self, text: str, analysis_type: str = "comprehensive") -> Dict:
        """Analyze a piece of text.

        Args:
            text: The text to analyze.
            analysis_type: One of "comprehensive", "sentiment",
                "content_classification", or "keyword_extraction";
                "comprehensive" enables every sub-analysis.

        Returns:
            A dict of analysis results, or ``{"error": <message>}`` when
            any sub-analysis raises.
        """
        try:
            results = {
                "original_text": text,
                "analysis_type": analysis_type,
                # NOTE(review): "word_count" is actually the raw character
                # count; key name kept for backward compatibility.
                "word_count": len(text),
                "char_count": len(text.replace(" ", "")),
                "sentiment": self._analyze_sentiment(text),
                "keywords": self._extract_keywords(text),
                "topics": self._extract_topics(text),
                "summary": "",
            }
            # Add type-specific sub-analyses on top of the common ones.
            if analysis_type in ("comprehensive", "sentiment"):
                results["sentiment_score"] = self._calculate_sentiment_score(text)
            if analysis_type in ("comprehensive", "content_classification"):
                results["content_category"] = self._classify_content(text)
                results["language"] = self._detect_language(text)
            if analysis_type in ("comprehensive", "keyword_extraction"):
                results["named_entities"] = self._extract_named_entities(text)
            # Summary must come last: it reads the keys assembled above.
            results["summary"] = self._generate_summary(results)
            # Lazy %-formatting: the message is only built if the log
            # level is enabled.
            logger.info("文字分析完成: %s", analysis_type)
            return results
        except Exception as e:
            logger.error("文字分析失敗: %s", e)
            return {"error": str(e)}

    def _analyze_sentiment(self, text: str) -> str:
        """Classify sentiment as 正面 / 負面 / 中性 by lexicon word counts."""
        words = jieba.lcut(text)
        positive_count = sum(1 for word in words if word in self.positive_words)
        negative_count = sum(1 for word in words if word in self.negative_words)
        if positive_count > negative_count:
            return "正面"
        elif negative_count > positive_count:
            return "負面"
        else:
            # Ties (including zero hits on both sides) are neutral.
            return "中性"

    def _calculate_sentiment_score(self, text: str) -> float:
        """Return a sentiment score in [-1, 1] (positive - negative ratio)."""
        words = jieba.lcut(text)
        positive_count = sum(1 for word in words if word in self.positive_words)
        negative_count = sum(1 for word in words if word in self.negative_words)
        total_words = len(words)
        if total_words == 0:
            # Empty input: no evidence either way.
            return 0.0
        score = (positive_count - negative_count) / total_words
        # Clamp defensively; the ratio is already within [-1, 1].
        return max(-1.0, min(1.0, score))

    def _extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
        """Return the top_k most frequent non-stop-word tokens (len > 1)."""
        words = jieba.lcut(text)
        # Drop stop words and single-character tokens (mostly particles).
        filtered_words = [
            word for word in words
            if len(word) > 1 and word not in self.stop_words
        ]
        word_freq = Counter(filtered_words)
        # most_common is deterministic: ties keep first-seen order.
        return [word for word, _ in word_freq.most_common(top_k)]

    def _extract_topics(self, text: str) -> List[str]:
        """Return topic hints (simplified: the top-5 keywords)."""
        # A real topic model (e.g. LDA) could replace this if needed.
        return self._extract_keywords(text, top_k=5)

    def _classify_content(self, text: str) -> str:
        """Classify content by trigger-word lookup; first match wins."""
        if any(word in text for word in ["新聞", "報導", "消息", "事件"]):
            return "新聞"
        elif any(word in text for word in ["評論", "觀點", "看法", "認為"]):
            return "評論"
        elif any(word in text for word in ["問題", "求助", "請教", "怎麼辦"]):
            return "問答"
        elif any(word in text for word in ["分享", "推薦", "介紹"]):
            return "分享"
        else:
            return "一般"

    def _detect_language(self, text: str) -> str:
        """Detect 中文 vs 其他 by the ratio of CJK characters."""
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        # Strip ALL whitespace (spaces, tabs, newlines) from the
        # denominator so formatting does not skew the ratio.
        total_chars = len(re.sub(r'\s', '', text))
        if total_chars == 0:
            return "未知"
        chinese_ratio = chinese_chars / total_chars
        return "中文" if chinese_ratio > 0.5 else "其他"

    def _extract_named_entities(self, text: str) -> List[str]:
        """Extract candidate named entities (simplified, regex-based)."""
        entities = []
        # Candidate person names: 2-4 consecutive CJK characters.
        # NOTE(review): this matches ANY short CJK run, so it is noisy.
        names = re.findall(r'[\u4e00-\u9fff]{2,4}', text)
        entities.extend(names)
        # Candidate organization names by common suffixes.
        org_patterns = [
            r'[\u4e00-\u9fff]+公司',
            r'[\u4e00-\u9fff]+大學',
            r'[\u4e00-\u9fff]+政府',
            r'[\u4e00-\u9fff]+協會',
        ]
        for pattern in org_patterns:
            entities.extend(re.findall(pattern, text))
        # Deduplicate while preserving first-seen order: list(set(...))
        # would make the result order nondeterministic across runs.
        return list(dict.fromkeys(entities))

    def _generate_summary(self, results: Dict) -> str:
        """Build a one-line, '|'-separated summary from analysis results."""
        summary_parts = []
        summary_parts.append(f"文字長度: {results['char_count']} 字符")
        summary_parts.append(f"情感傾向: {results['sentiment']}")
        if 'sentiment_score' in results:
            score = results['sentiment_score']
            summary_parts.append(f"情感分數: {score:.2f}")
        # .get(): tolerate result dicts assembled without keywords.
        if results.get('keywords'):
            summary_parts.append(f"主要關鍵詞: {', '.join(results['keywords'][:5])}")
        if 'content_category' in results:
            summary_parts.append(f"內容類型: {results['content_category']}")
        return " | ".join(summary_parts)