PawMatchAI / priority_detector.py
DawnC's picture
Upload 19 files
1b3ab7b verified
# %%writefile priority_detector.py
import re
import numpy as np
from typing import Dict, List, Tuple, Set, Optional, Any
from dataclasses import dataclass, field
import traceback
@dataclass
class PriorityDetectionResult:
"""優先級檢測結果"""
dimension_priorities: Dict[str, float] = field(default_factory=dict)
detected_emphases: Dict[str, List[float]] = field(default_factory=dict)
detected_rankings: Dict[str, int] = field(default_factory=dict)
detected_negatives: List[str] = field(default_factory=list)
detection_confidence: float = 1.0
class PriorityDetector:
"""
優先級檢測器
檢測使用者輸入中的優先級表達,包括強調關鍵字、排序詞、負面約束
"""
def __init__(self):
"""初始化優先級檢測器"""
self.emphasis_keywords = self._initialize_emphasis_keywords()
self.ranking_keywords = self._initialize_ranking_keywords()
self.negative_keywords = self._initialize_negative_keywords()
self.dimension_keywords = self._initialize_dimension_keywords()
self.absolute_max_priority = 2.5
def _initialize_emphasis_keywords(self) -> Dict[str, Dict[str, List[str]]]:
"""初始化強調關鍵字"""
return {
'strong_emphasis': {
'en': [
'most important', 'most importantly', 'must have', 'absolutely need',
'critical', 'essential', 'top priority', 'crucial',
'absolutely', 'definitely', 'certainly', 'paramount',
'vital', 'indispensable', 'mandatory', 'imperative'
]
},
'medium_emphasis': {
'en': [
'really want', 'prefer', 'hope for', 'would like',
'strongly prefer', 'important', 'significant',
'really need', 'very important', 'highly prefer'
]
},
'mild_emphasis': {
'en': [
'nice to have', 'ideally', 'if possible', 'bonus if',
'preferably', 'would be nice', 'hopefully',
'optimally', 'wish for'
]
}
}
def _initialize_ranking_keywords(self) -> Dict[str, List[str]]:
"""初始化排序關鍵字"""
return {
'en': [
'first', 'second', 'third', 'fourth', 'fifth',
'1st', '2nd', '3rd', '4th', '5th',
'firstly', 'secondly', 'thirdly',
'primary', 'secondary', 'tertiary'
]
}
def _initialize_negative_keywords(self) -> Dict[str, List[str]]:
"""初始化負面約束關鍵字"""
return {
'en': [
'must not', 'cannot', "don't want", "don't need",
'absolutely no', 'cannot tolerate', 'no way',
'avoid', 'never', 'not', 'refuse',
'unacceptable', 'won\'t accept'
]
}
def _initialize_dimension_keywords(self) -> Dict[str, List[str]]:
"""初始化維度關鍵字映射"""
return {
'noise': [
'quiet', 'silent', 'not noisy', "doesn't bark", 'peaceful',
'noise', 'barking', 'vocal', 'loud', 'sound'
],
'size': [
'small', 'medium', 'large', 'tiny', 'big', 'compact',
'size', 'giant', 'toy', 'miniature'
],
'grooming': [
'low maintenance', 'easy care', 'minimal grooming', 'low-maintenance',
'grooming', 'care', 'maintenance', 'brush', 'shed', 'shedding'
],
'family': [
'good with kids', 'child friendly', 'family dog',
'children', 'kids', 'family', 'toddler', 'baby'
],
'exercise': [
'active', 'exercise', 'energy', 'activity',
'lazy', 'calm', 'energetic', 'athletic', 'work full time'
],
'experience': [
'first time', 'first dog', 'beginner', 'new to dogs', 'inexperienced',
'easy to train', 'trainable', 'obedient', 'never owned', 'never had'
],
'health': [
'healthy', 'health', 'lifespan', 'longevity',
'medical', 'genetic issues'
]
}
def detect_priorities(self, user_input: str) -> PriorityDetectionResult:
"""
檢測使用者輸入中的優先級
Args:
user_input: 使用者輸入文字
Returns:
PriorityDetectionResult: 優先級檢測結果
"""
try:
if not user_input or not user_input.strip():
return PriorityDetectionResult()
normalized_input = user_input.lower().strip()
# Step 1: 檢測強調關鍵字
detected_emphases = self._detect_emphasis_keywords(normalized_input)
# Step 2: 檢測排序詞
detected_rankings = self._detect_explicit_ranking(normalized_input)
# Step 3: 檢測負面約束
detected_negatives = self._detect_negative_constraints(normalized_input)
# Step 4: 檢測所有提及的維度(即使沒有強調詞)
mentioned_dimensions = self._detect_mentioned_dimensions(normalized_input)
# Step 5: 計算疊加優先級(包括提及的維度)
dimension_priorities = self._calculate_final_priorities(
detected_emphases, detected_rankings, mentioned_dimensions
)
# Step 6: 計算信心度
detection_confidence = self._calculate_detection_confidence(
detected_emphases, detected_rankings, normalized_input
)
return PriorityDetectionResult(
dimension_priorities=dimension_priorities,
detected_emphases=detected_emphases,
detected_rankings=detected_rankings,
detected_negatives=detected_negatives,
detection_confidence=detection_confidence
)
except Exception as e:
print(f"Error detecting priorities: {str(e)}")
print(traceback.format_exc())
return PriorityDetectionResult()
def _detect_mentioned_dimensions(self, text: str) -> Set[str]:
"""
檢測文字中提及的所有維度(不需要強調詞)
Args:
text: 正規化後的輸入文字
Returns:
Set[str]: 提及的維度集合
"""
mentioned = set()
for dimension, keywords in self.dimension_keywords.items():
for keyword in keywords:
if keyword in text:
mentioned.add(dimension)
break # 一個維度只需匹配一次
return mentioned
def _detect_emphasis_keywords(self, text: str) -> Dict[str, List[float]]:
"""檢測強調關鍵字"""
detected = {}
# 定義權重倍數
emphasis_weights = {
'strong_emphasis': 2.0,
'medium_emphasis': 1.5,
'mild_emphasis': 1.2
}
# 為每個強調級別檢測
for emphasis_level, keywords_dict in self.emphasis_keywords.items():
weight = emphasis_weights[emphasis_level]
for lang, keywords in keywords_dict.items():
for keyword in keywords:
if keyword in text:
# 找到關鍵字附近的維度詞
dimensions = self._extract_nearby_dimensions(text, keyword)
for dimension in dimensions:
if dimension not in detected:
detected[dimension] = []
detected[dimension].append(weight)
return detected
def _detect_explicit_ranking(self, text: str) -> Dict[str, int]:
"""檢測明確排序詞"""
detected = {}
# 排序詞到排名的映射
ranking_map = {
'first': 1, '1st': 1, 'firstly': 1, 'primary': 1,
'second': 2, '2nd': 2, 'secondly': 2, 'secondary': 2,
'third': 3, '3rd': 3, 'thirdly': 3, 'tertiary': 3,
'fourth': 4, '4th': 4,
'fifth': 5, '5th': 5
}
for keyword in self.ranking_keywords['en']:
if keyword in text:
rank = ranking_map.get(keyword, 0)
if rank > 0:
# 找到排序詞附近的維度詞
dimensions = self._extract_nearby_dimensions(text, keyword)
for dimension in dimensions:
# 如果已經有排名,取較高優先級(較小的數字)
if dimension in detected:
detected[dimension] = min(detected[dimension], rank)
else:
detected[dimension] = rank
return detected
def _detect_negative_constraints(self, text: str) -> List[str]:
"""檢測負面約束"""
detected = []
for lang, keywords in self.negative_keywords.items():
for keyword in keywords:
if keyword in text:
# 找到負面關鍵字附近的維度詞
dimensions = self._extract_nearby_dimensions(text, keyword)
detected.extend(dimensions)
return list(set(detected))
def _extract_nearby_dimensions(self, text: str, keyword: str, window: int = 50) -> List[str]:
"""
提取關鍵字附近的維度詞
Args:
text: 文字
keyword: 關鍵字
window: 搜尋窗口大小(字元數)
Returns:
List[str]: 檢測到的維度列表
"""
detected_dimensions = []
# 找到關鍵字位置
keyword_positions = [m.start() for m in re.finditer(re.escape(keyword), text)]
for pos in keyword_positions:
# 定義搜尋窗口
start = max(0, pos - window)
end = min(len(text), pos + len(keyword) + window)
window_text = text[start:end]
# 在窗口中搜尋維度關鍵字
for dimension, dimension_keywords in self.dimension_keywords.items():
for dim_keyword in dimension_keywords:
if dim_keyword in window_text:
detected_dimensions.append(dimension)
break # 找到一個就夠了,不重複添加
return list(set(detected_dimensions))
def _calculate_final_priorities(self,
detected_emphases: Dict[str, List[float]],
detected_rankings: Dict[str, int],
mentioned_dimensions: Set[str] = None) -> Dict[str, float]:
"""
計算最終優先級(疊加邏輯)
Args:
detected_emphases: 檢測到的強調 {dimension: [weights]}
detected_rankings: 檢測到的排序 {dimension: rank}
mentioned_dimensions: 被提及但沒有強調詞的維度
Returns:
Dict[str, float]: 最終優先級分數
"""
final_priorities = {}
if mentioned_dimensions is None:
mentioned_dimensions = set()
# 合併所有提及的維度(包括強調、排序、和一般提及)
all_dimensions = set(detected_emphases.keys()) | set(detected_rankings.keys()) | mentioned_dimensions
for dimension in all_dimensions:
emphasis_scores = detected_emphases.get(dimension, [])
ranking = detected_rankings.get(dimension, 0)
is_mentioned = dimension in mentioned_dimensions
# 計算疊加分數
if emphasis_scores or ranking > 0:
# 有強調詞或排序詞
final_score = self._calculate_stacked_priority(emphasis_scores, ranking)
elif is_mentioned:
# 僅被提及(沒有強調詞),給予基本優先級提升
final_score = 1.3 # 基本提升,讓系統知道這個維度是使用者關心的
else:
final_score = 1.0
final_priorities[dimension] = final_score
return final_priorities
def _calculate_stacked_priority(self,
emphases: List[float],
ranking: int = 0) -> float:
"""
計算疊加後的優先級分數
邏輯:
1. 取最高強調作為基礎
2. 其他強調提供遞減加成
3. 排序詞轉換為權重並疊加
4. 確保不超過絕對上限 2.5
Args:
emphases: 強調權重列表
ranking: 排序位置 (1=first, 2=second, etc.)
Returns:
float: 最終優先級分數
"""
if not emphases and ranking == 0:
return 1.0
# 轉換排序為權重
ranking_weights = {
1: 2.0, # first
2: 1.7, # second
3: 1.4, # third
4: 1.2, # fourth
5: 1.1 # fifth
}
ranking_weight = ranking_weights.get(ranking, 0.0)
# 合併所有權重
all_weights = emphases.copy()
if ranking_weight > 0:
all_weights.append(ranking_weight)
if not all_weights:
return 1.0
# 排序取最高作為基礎
sorted_weights = sorted(all_weights, reverse=True)
base_score = sorted_weights[0]
# 額外權重提供遞減加成 (reduced stacking bonus)
bonus = 0.0
for i, weight in enumerate(sorted_weights[1:], start=1):
# 遞減加成: 第2個給30%, 第3個給15%, 第4個給7.5% (reduced from 50/25/12.5)
bonus += (weight - 1.0) * (0.3 / i)
final_score = min(base_score + bonus, self.absolute_max_priority)
return final_score
def _calculate_detection_confidence(self,
detected_emphases: Dict[str, List[float]],
detected_rankings: Dict[str, int],
text: str) -> float:
"""
計算檢測信心度
Args:
detected_emphases: 檢測到的強調
detected_rankings: 檢測到的排序
text: 原始文字
Returns:
float: 信心度 (0-1)
"""
confidence = 0.5 # 基礎信心度
# 有明確強調 +0.3
if detected_emphases:
confidence += 0.3
# 有明確排序 +0.2
if detected_rankings:
confidence += 0.2
# 文字長度適中 +0.1
word_count = len(text.split())
if 10 <= word_count <= 100:
confidence += 0.1
return min(1.0, confidence)
def get_detection_summary(self, result: PriorityDetectionResult) -> Dict[str, Any]:
"""
獲取檢測摘要
Args:
result: 優先級檢測結果
Returns:
Dict[str, Any]: 檢測摘要
"""
return {
'total_dimensions_detected': len(result.dimension_priorities),
'high_priority_dimensions': [
dim for dim, score in result.dimension_priorities.items() if score >= 1.5
],
'dimension_priorities': result.dimension_priorities,
'emphases_detected': len(result.detected_emphases),
'rankings_detected': len(result.detected_rankings),
'negative_constraints': result.detected_negatives,
'detection_confidence': result.detection_confidence
}
def detect_user_priorities(user_input: str) -> PriorityDetectionResult:
"""
便利函數: 檢測使用者優先級
Args:
user_input: 使用者輸入
Returns:
PriorityDetectionResult: 檢測結果
"""
detector = PriorityDetector()
return detector.detect_priorities(user_input)
def get_priority_summary(user_input: str) -> Dict[str, Any]:
"""
便利函數: 獲取優先級摘要
Args:
user_input: 使用者輸入
Returns:
Dict[str, Any]: 優先級摘要
"""
detector = PriorityDetector()
result = detector.detect_priorities(user_input)
return detector.get_detection_summary(result)