# motion_analyze/modules/multimodal_fusion.py
# mikao007's picture
# Upload 12 files
# e92e423 verified
"""
多模態融合分析模組
"""
import logging
from collections import Counter
from typing import Dict, List, Optional, Tuple

import numpy as np
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class MultimodalFusion:
    """Fuse text, image and video analysis results into a single verdict.

    Each modality is supplied as an optional dict produced by an upstream
    analyzer.  A modality takes part in the fusion only when its dict is
    non-empty and carries no truthy ``"error"`` entry.
    """

    # Fixed per-modality confidence values (simplified placeholder: the
    # original code notes that a real implementation would derive these from
    # the quality of each analysis).
    _BASE_CONFIDENCE = {"text": 0.8, "image": 0.7, "video": 0.6}

    # A fused score strictly above +threshold is labelled positive, strictly
    # below -threshold negative, anything in between neutral.
    _SENTIMENT_THRESHOLD = 0.3

    def __init__(self):
        """Set up the modality weights and the sentiment-label-to-score table."""
        # Relative weight of each modality in every weighted average.
        self.weights = {
            "text": 0.4,
            "image": 0.35,
            "video": 0.25,
        }
        # Numeric value assigned to each sentiment label.
        self.emotion_mapping = {
            "正面": 1.0,
            "中性": 0.0,
            "負面": -1.0,
        }

    def fuse_analysis(self,
                      text_analysis: Optional[Dict] = None,
                      image_analysis: Optional[Dict] = None,
                      video_analysis: Optional[Dict] = None) -> Dict:
        """Fuse the available per-modality analysis results.

        Args:
            text_analysis: Text analysis result, or None.
            image_analysis: Image analysis result, or None.
            video_analysis: Video analysis result, or None.

        Returns:
            A dict with the keys ``modalities``, ``fused_sentiment``,
            ``fused_sentiment_score``, ``content_category``, ``confidence``,
            ``key_insights`` and ``summary``.  When no modality is usable the
            defaults are returned with an explanatory ``summary``.  If any
            step raises, ``{"error": <message>}`` is returned instead.
        """
        try:
            results = {
                "modalities": [],
                "fused_sentiment": "中性",
                "fused_sentiment_score": 0.0,
                "content_category": "一般",
                "confidence": 0.0,
                "key_insights": [],
                "summary": "",
            }

            # A modality is usable when its result is non-empty and error-free.
            candidates = (
                ("text", text_analysis),
                ("image", image_analysis),
                ("video", video_analysis),
            )
            available_modalities = [
                name for name, analysis in candidates
                if analysis and not analysis.get("error")
            ]
            results["modalities"] = available_modalities

            if not available_modalities:
                results["summary"] = "無可用的分析模態"
                return results

            fusion_args = (
                text_analysis, image_analysis, video_analysis,
                available_modalities,
            )
            (results["fused_sentiment"],
             results["fused_sentiment_score"]) = self._fuse_sentiment(*fusion_args)
            results["content_category"] = self._fuse_content_category(*fusion_args)
            results["confidence"] = self._calculate_confidence(*fusion_args)
            results["key_insights"] = self._extract_key_insights(*fusion_args)
            results["summary"] = self._generate_fusion_summary(results)

            logger.info("多模態融合分析完成,使用模態: %s", available_modalities)
            return results
        except Exception as e:
            # Top-level boundary: callers receive an error dict rather than an
            # exception; logger.exception keeps the traceback in the log.
            logger.exception("多模態融合分析失敗: %s", e)
            return {"error": str(e)}

    @staticmethod
    def _active_analyses(text_analysis: Optional[Dict],
                         image_analysis: Optional[Dict],
                         video_analysis: Optional[Dict],
                         modalities: List[str]) -> Dict[str, Dict]:
        """Return ``{modality: analysis}`` for modalities that are listed and non-empty.

        The result is always ordered text, image, video.
        """
        candidates = (
            ("text", text_analysis),
            ("image", image_analysis),
            ("video", video_analysis),
        )
        return {
            name: analysis
            for name, analysis in candidates
            if name in modalities and analysis
        }

    def _label_score(self, label) -> float:
        """Map a sentiment label to its numeric value; unknown labels count as 0.0."""
        if not isinstance(label, str):
            return 0.0
        return self.emotion_mapping.get(label, 0.0)

    def _sentiment_score(self, analysis: Dict) -> float:
        """Return the numeric sentiment of a text or image analysis.

        A ``"sentiment_score"`` entry takes precedence over the ``"sentiment"``
        label.  When the score is present but unusable (None, non-numeric, NaN
        or infinite) the label mapping is used instead, so one bad value cannot
        break the weighted average.
        """
        fallback = self._label_score(analysis.get("sentiment", "中性"))
        if "sentiment_score" not in analysis:
            return fallback
        try:
            score = float(analysis["sentiment_score"])
        except (TypeError, ValueError):
            return fallback
        return score if np.isfinite(score) else fallback

    def _fuse_sentiment(self, text_analysis: Optional[Dict],
                        image_analysis: Optional[Dict],
                        video_analysis: Optional[Dict],
                        modalities: List[str]) -> Tuple[str, float]:
        """Combine per-modality sentiment into one label and score.

        Returns:
            ``(label, score)`` where ``score`` is the weighted average of the
            per-modality scores using ``self.weights``.  Returns
            ``("中性", 0.0)`` when no modality contributes.
        """
        active = self._active_analyses(
            text_analysis, image_analysis, video_analysis, modalities
        )

        scores = []
        weights = []
        for name, analysis in active.items():
            if name == "video":
                # Video contributes only through its "audio_sentiment" label;
                # no numeric score is read for this modality.
                score = self._label_score(analysis.get("audio_sentiment", "中性"))
            else:
                score = self._sentiment_score(analysis)
            scores.append(score)
            weights.append(self.weights[name])

        if not scores:
            return "中性", 0.0

        weighted_score = float(np.average(scores, weights=weights))

        if weighted_score > self._SENTIMENT_THRESHOLD:
            sentiment_label = "正面"
        elif weighted_score < -self._SENTIMENT_THRESHOLD:
            sentiment_label = "負面"
        else:
            sentiment_label = "中性"
        return sentiment_label, weighted_score

    def _fuse_content_category(self, text_analysis: Optional[Dict],
                               image_analysis: Optional[Dict],
                               video_analysis: Optional[Dict],
                               modalities: List[str]) -> str:
        """Pick the content category by an unweighted vote across modalities.

        Text votes with its ``"content_category"``, image with a category
        derived from ``"scene"``, video with one derived from ``"actions"``.
        Missing or None values vote for the generic category ``"一般"``.
        On a tie the earliest vote (text, then image, then video) wins, since
        ``Counter.most_common`` keeps first-encountered order for equal counts.
        """
        active = self._active_analyses(
            text_analysis, image_analysis, video_analysis, modalities
        )
        votes = []

        if "text" in active:
            votes.append(active["text"].get("content_category") or "一般")

        if "image" in active:
            # `or ""` guards against an explicit None, which would otherwise
            # make the `in` tests below raise TypeError.
            scene = active["image"].get("scene") or ""
            if "戶外" in scene:
                votes.append("戶外")
            elif "室內" in scene:
                votes.append("室內")
            else:
                votes.append("一般")

        if "video" in active:
            actions = active["video"].get("actions") or []
            if "運動" in actions:
                votes.append("運動")
            elif "靜止" in actions:
                votes.append("靜態")
            else:
                votes.append("一般")

        if not votes:
            return "一般"
        return Counter(votes).most_common(1)[0][0]

    def _calculate_confidence(self, text_analysis: Optional[Dict],
                              image_analysis: Optional[Dict],
                              video_analysis: Optional[Dict],
                              modalities: List[str]) -> float:
        """Return the weighted average of the fixed per-modality confidences.

        Uses ``_BASE_CONFIDENCE`` weighted by ``self.weights``; returns 0.0
        when no modality contributes.
        """
        active = self._active_analyses(
            text_analysis, image_analysis, video_analysis, modalities
        )
        if not active:
            return 0.0

        names = list(active)
        return float(np.average(
            [self._BASE_CONFIDENCE[name] for name in names],
            weights=[self.weights[name] for name in names],
        ))

    @staticmethod
    def _join_items(items, limit: Optional[int] = None) -> str:
        """Join up to ``limit`` items with ``", "``, converting each to ``str``.

        A bare string or non-iterable value is treated as a single item (so a
        string is not split into characters), and None entries are skipped.
        """
        if isinstance(items, (str, bytes)) or not hasattr(items, "__iter__"):
            items = [items]
        texts = [str(item) for item in items if item is not None]
        return ", ".join(texts[:limit])

    def _extract_key_insights(self, text_analysis: Optional[Dict],
                              image_analysis: Optional[Dict],
                              video_analysis: Optional[Dict],
                              modalities: List[str]) -> List[str]:
        """Collect short human-readable highlights from each active modality.

        Text contributes up to three keywords and two topics, image up to
        three objects plus its scene, video all of its actions plus the
        ``motion["motion_type"]`` value when present.
        """
        active = self._active_analyses(
            text_analysis, image_analysis, video_analysis, modalities
        )
        insights = []

        def add_list(prefix: str, items, limit: Optional[int] = None) -> None:
            # Skip missing/empty collections so no blank insight is emitted.
            if not items:
                return
            joined = self._join_items(items, limit)
            if joined:
                insights.append(f"{prefix}: {joined}")

        if "text" in active:
            text = active["text"]
            add_list("文字關鍵詞", text.get("keywords", []), 3)
            add_list("文字主題", text.get("topics", []), 2)

        if "image" in active:
            image = active["image"]
            add_list("圖片物件", image.get("objects", []), 3)
            scene = image.get("scene", "")
            if scene:
                insights.append(f"圖片場景: {scene}")

        if "video" in active:
            video = active["video"]
            add_list("影片動作", video.get("actions", []))
            motion = video.get("motion", {})
            # Only a dict can carry "motion_type"; anything else is ignored.
            if isinstance(motion, dict) and motion.get("motion_type"):
                insights.append(f"運動類型: {motion['motion_type']}")

        return insights

    def _generate_fusion_summary(self, results: Dict) -> str:
        """Render the fused results as one ``" | "``-separated summary line.

        The line lists the modalities, fused sentiment with its score,
        content category, confidence and, when present, the first three key
        insights.
        """
        modalities = results.get("modalities", [])
        sentiment = results.get("fused_sentiment", "未知")
        sentiment_score = results.get("fused_sentiment_score", 0.0)
        category = results.get("content_category", "一般")
        confidence = results.get("confidence", 0.0)
        insights = results.get("key_insights", [])

        summary_parts = [
            f"分析模態: {', '.join(modalities)}",
            f"綜合情感: {sentiment} ({sentiment_score:.2f})",
            f"內容類型: {category}",
            f"分析置信度: {confidence:.2f}",
        ]
        if insights:
            summary_parts.append(f"關鍵洞察: {'; '.join(insights[:3])}")
        return " | ".join(summary_parts)