# motion_analyze/modules/text_analyzer.py
"""
文字內容分析模組
"""
import re
import jieba
from typing import Dict, List, Optional
import logging
from collections import Counter
logger = logging.getLogger(__name__)
class TextAnalyzer:
    """Chinese text content analyzer.

    Provides sentiment analysis, keyword/topic extraction, coarse content
    classification, language detection and simplified named-entity
    extraction, built on jieba segmentation and small hand-crafted lexicons.
    """

    def __init__(self):
        """Initialize the analyzer: warm up jieba and build the lexicons."""
        # Pre-load jieba's dictionary so the first analysis call is fast.
        jieba.initialize()
        # Sentiment lexicons (simplified). Duplicate entries from the
        # original literals removed ("優秀" listed twice, "討厭" three
        # times) — sets deduplicate anyway, so behavior is unchanged.
        self.positive_words = {
            "好", "棒", "讚", "優秀", "完美", "喜歡", "愛", "開心", "快樂", "高興",
            "滿意", "驚喜", "感動", "溫暖", "美好", "精彩", "出色", "傑出"
        }
        self.negative_words = {
            "壞", "差", "爛", "討厭", "恨", "生氣", "憤怒", "失望", "難過", "痛苦",
            "糟糕", "惡劣", "可惡", "煩人", "無聊", "噁心", "恐怖"
        }
        # Stop words filtered out during keyword extraction.
        self.stop_words = {
            "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一",
            "一個", "上", "也", "很", "到", "說", "要", "去", "你", "會", "著", "沒有",
            "看", "好", "自己", "這", "那", "他", "她", "它", "們", "我們", "你們"
        }
def analyze(self, text: str, analysis_type: str = "comprehensive") -> Dict:
"""
分析文字內容
Args:
text: 要分析的文字
analysis_type: 分析類型
Returns:
分析結果字典
"""
try:
results = {
"original_text": text,
"analysis_type": analysis_type,
"word_count": len(text),
"char_count": len(text.replace(" ", "")),
"sentiment": self._analyze_sentiment(text),
"keywords": self._extract_keywords(text),
"topics": self._extract_topics(text),
"summary": ""
}
# 根據分析類型添加特定分析
if analysis_type in ["comprehensive", "sentiment"]:
results["sentiment_score"] = self._calculate_sentiment_score(text)
if analysis_type in ["comprehensive", "content_classification"]:
results["content_category"] = self._classify_content(text)
results["language"] = self._detect_language(text)
if analysis_type in ["comprehensive", "keyword_extraction"]:
results["named_entities"] = self._extract_named_entities(text)
# 生成總結
results["summary"] = self._generate_summary(results)
logger.info(f"文字分析完成: {analysis_type}")
return results
except Exception as e:
logger.error(f"文字分析失敗: {e}")
return {"error": str(e)}
def _analyze_sentiment(self, text: str) -> str:
"""分析情感傾向"""
words = jieba.lcut(text)
positive_count = sum(1 for word in words if word in self.positive_words)
negative_count = sum(1 for word in words if word in self.negative_words)
if positive_count > negative_count:
return "正面"
elif negative_count > positive_count:
return "負面"
else:
return "中性"
def _calculate_sentiment_score(self, text: str) -> float:
"""計算情感分數 (-1 到 1)"""
words = jieba.lcut(text)
positive_count = sum(1 for word in words if word in self.positive_words)
negative_count = sum(1 for word in words if word in self.negative_words)
total_words = len(words)
if total_words == 0:
return 0.0
score = (positive_count - negative_count) / total_words
return max(-1.0, min(1.0, score))
def _extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
"""提取關鍵詞"""
words = jieba.lcut(text)
# 過濾停用詞和短詞
filtered_words = [
word for word in words
if len(word) > 1 and word not in self.stop_words
]
# 計算詞頻
word_freq = Counter(filtered_words)
# 返回最常見的詞
return [word for word, freq in word_freq.most_common(top_k)]
def _extract_topics(self, text: str) -> List[str]:
"""提取主題(簡化版)"""
# 這裡使用簡單的關鍵詞提取作為主題
keywords = self._extract_keywords(text, top_k=5)
# 可以根據需要添加更複雜的主題建模
return keywords
def _classify_content(self, text: str) -> str:
"""內容分類"""
# 簡化的內容分類
if any(word in text for word in ["新聞", "報導", "消息", "事件"]):
return "新聞"
elif any(word in text for word in ["評論", "觀點", "看法", "認為"]):
return "評論"
elif any(word in text for word in ["問題", "求助", "請教", "怎麼辦"]):
return "問答"
elif any(word in text for word in ["分享", "推薦", "介紹", "推薦"]):
return "分享"
else:
return "一般"
def _detect_language(self, text: str) -> str:
"""檢測語言"""
# 簡單的中文檢測
chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
total_chars = len(text.replace(" ", ""))
if total_chars == 0:
return "未知"
chinese_ratio = chinese_chars / total_chars
if chinese_ratio > 0.5:
return "中文"
else:
return "其他"
def _extract_named_entities(self, text: str) -> List[str]:
"""提取命名實體(簡化版)"""
# 簡單的實體提取
entities = []
# 提取可能的姓名(2-4個中文字符)
names = re.findall(r'[\u4e00-\u9fff]{2,4}', text)
entities.extend(names)
# 提取可能的組織名稱
org_patterns = [
r'[\u4e00-\u9fff]+公司',
r'[\u4e00-\u9fff]+大學',
r'[\u4e00-\u9fff]+政府',
r'[\u4e00-\u9fff]+協會'
]
for pattern in org_patterns:
orgs = re.findall(pattern, text)
entities.extend(orgs)
return list(set(entities))
def _generate_summary(self, results: Dict) -> str:
"""生成分析總結"""
summary_parts = []
summary_parts.append(f"文字長度: {results['char_count']} 字符")
summary_parts.append(f"情感傾向: {results['sentiment']}")
if 'sentiment_score' in results:
score = results['sentiment_score']
summary_parts.append(f"情感分數: {score:.2f}")
if results['keywords']:
summary_parts.append(f"主要關鍵詞: {', '.join(results['keywords'][:5])}")
if 'content_category' in results:
summary_parts.append(f"內容類型: {results['content_category']}")
return " | ".join(summary_parts)