Spaces:
Running
on
Zero
Running
on
Zero
| # %%writefile priority_detector.py | |
| import re | |
| import numpy as np | |
| from typing import Dict, List, Tuple, Set, Optional, Any | |
| from dataclasses import dataclass, field | |
| import traceback | |
| class PriorityDetectionResult: | |
| """優先級檢測結果""" | |
| dimension_priorities: Dict[str, float] = field(default_factory=dict) | |
| detected_emphases: Dict[str, List[float]] = field(default_factory=dict) | |
| detected_rankings: Dict[str, int] = field(default_factory=dict) | |
| detected_negatives: List[str] = field(default_factory=list) | |
| detection_confidence: float = 1.0 | |
| class PriorityDetector: | |
| """ | |
| 優先級檢測器 | |
| 檢測使用者輸入中的優先級表達,包括強調關鍵字、排序詞、負面約束 | |
| """ | |
| def __init__(self): | |
| """初始化優先級檢測器""" | |
| self.emphasis_keywords = self._initialize_emphasis_keywords() | |
| self.ranking_keywords = self._initialize_ranking_keywords() | |
| self.negative_keywords = self._initialize_negative_keywords() | |
| self.dimension_keywords = self._initialize_dimension_keywords() | |
| self.absolute_max_priority = 2.5 | |
| def _initialize_emphasis_keywords(self) -> Dict[str, Dict[str, List[str]]]: | |
| """初始化強調關鍵字""" | |
| return { | |
| 'strong_emphasis': { | |
| 'en': [ | |
| 'most important', 'most importantly', 'must have', 'absolutely need', | |
| 'critical', 'essential', 'top priority', 'crucial', | |
| 'absolutely', 'definitely', 'certainly', 'paramount', | |
| 'vital', 'indispensable', 'mandatory', 'imperative' | |
| ] | |
| }, | |
| 'medium_emphasis': { | |
| 'en': [ | |
| 'really want', 'prefer', 'hope for', 'would like', | |
| 'strongly prefer', 'important', 'significant', | |
| 'really need', 'very important', 'highly prefer' | |
| ] | |
| }, | |
| 'mild_emphasis': { | |
| 'en': [ | |
| 'nice to have', 'ideally', 'if possible', 'bonus if', | |
| 'preferably', 'would be nice', 'hopefully', | |
| 'optimally', 'wish for' | |
| ] | |
| } | |
| } | |
| def _initialize_ranking_keywords(self) -> Dict[str, List[str]]: | |
| """初始化排序關鍵字""" | |
| return { | |
| 'en': [ | |
| 'first', 'second', 'third', 'fourth', 'fifth', | |
| '1st', '2nd', '3rd', '4th', '5th', | |
| 'firstly', 'secondly', 'thirdly', | |
| 'primary', 'secondary', 'tertiary' | |
| ] | |
| } | |
| def _initialize_negative_keywords(self) -> Dict[str, List[str]]: | |
| """初始化負面約束關鍵字""" | |
| return { | |
| 'en': [ | |
| 'must not', 'cannot', "don't want", "don't need", | |
| 'absolutely no', 'cannot tolerate', 'no way', | |
| 'avoid', 'never', 'not', 'refuse', | |
| 'unacceptable', 'won\'t accept' | |
| ] | |
| } | |
| def _initialize_dimension_keywords(self) -> Dict[str, List[str]]: | |
| """初始化維度關鍵字映射""" | |
| return { | |
| 'noise': [ | |
| 'quiet', 'silent', 'not noisy', "doesn't bark", 'peaceful', | |
| 'noise', 'barking', 'vocal', 'loud', 'sound' | |
| ], | |
| 'size': [ | |
| 'small', 'medium', 'large', 'tiny', 'big', 'compact', | |
| 'size', 'giant', 'toy', 'miniature' | |
| ], | |
| 'grooming': [ | |
| 'low maintenance', 'easy care', 'minimal grooming', 'low-maintenance', | |
| 'grooming', 'care', 'maintenance', 'brush', 'shed', 'shedding' | |
| ], | |
| 'family': [ | |
| 'good with kids', 'child friendly', 'family dog', | |
| 'children', 'kids', 'family', 'toddler', 'baby' | |
| ], | |
| 'exercise': [ | |
| 'active', 'exercise', 'energy', 'activity', | |
| 'lazy', 'calm', 'energetic', 'athletic', 'work full time' | |
| ], | |
| 'experience': [ | |
| 'first time', 'first dog', 'beginner', 'new to dogs', 'inexperienced', | |
| 'easy to train', 'trainable', 'obedient', 'never owned', 'never had' | |
| ], | |
| 'health': [ | |
| 'healthy', 'health', 'lifespan', 'longevity', | |
| 'medical', 'genetic issues' | |
| ] | |
| } | |
| def detect_priorities(self, user_input: str) -> PriorityDetectionResult: | |
| """ | |
| 檢測使用者輸入中的優先級 | |
| Args: | |
| user_input: 使用者輸入文字 | |
| Returns: | |
| PriorityDetectionResult: 優先級檢測結果 | |
| """ | |
| try: | |
| if not user_input or not user_input.strip(): | |
| return PriorityDetectionResult() | |
| normalized_input = user_input.lower().strip() | |
| # Step 1: 檢測強調關鍵字 | |
| detected_emphases = self._detect_emphasis_keywords(normalized_input) | |
| # Step 2: 檢測排序詞 | |
| detected_rankings = self._detect_explicit_ranking(normalized_input) | |
| # Step 3: 檢測負面約束 | |
| detected_negatives = self._detect_negative_constraints(normalized_input) | |
| # Step 4: 檢測所有提及的維度(即使沒有強調詞) | |
| mentioned_dimensions = self._detect_mentioned_dimensions(normalized_input) | |
| # Step 5: 計算疊加優先級(包括提及的維度) | |
| dimension_priorities = self._calculate_final_priorities( | |
| detected_emphases, detected_rankings, mentioned_dimensions | |
| ) | |
| # Step 6: 計算信心度 | |
| detection_confidence = self._calculate_detection_confidence( | |
| detected_emphases, detected_rankings, normalized_input | |
| ) | |
| return PriorityDetectionResult( | |
| dimension_priorities=dimension_priorities, | |
| detected_emphases=detected_emphases, | |
| detected_rankings=detected_rankings, | |
| detected_negatives=detected_negatives, | |
| detection_confidence=detection_confidence | |
| ) | |
| except Exception as e: | |
| print(f"Error detecting priorities: {str(e)}") | |
| print(traceback.format_exc()) | |
| return PriorityDetectionResult() | |
| def _detect_mentioned_dimensions(self, text: str) -> Set[str]: | |
| """ | |
| 檢測文字中提及的所有維度(不需要強調詞) | |
| Args: | |
| text: 正規化後的輸入文字 | |
| Returns: | |
| Set[str]: 提及的維度集合 | |
| """ | |
| mentioned = set() | |
| for dimension, keywords in self.dimension_keywords.items(): | |
| for keyword in keywords: | |
| if keyword in text: | |
| mentioned.add(dimension) | |
| break # 一個維度只需匹配一次 | |
| return mentioned | |
| def _detect_emphasis_keywords(self, text: str) -> Dict[str, List[float]]: | |
| """檢測強調關鍵字""" | |
| detected = {} | |
| # 定義權重倍數 | |
| emphasis_weights = { | |
| 'strong_emphasis': 2.0, | |
| 'medium_emphasis': 1.5, | |
| 'mild_emphasis': 1.2 | |
| } | |
| # 為每個強調級別檢測 | |
| for emphasis_level, keywords_dict in self.emphasis_keywords.items(): | |
| weight = emphasis_weights[emphasis_level] | |
| for lang, keywords in keywords_dict.items(): | |
| for keyword in keywords: | |
| if keyword in text: | |
| # 找到關鍵字附近的維度詞 | |
| dimensions = self._extract_nearby_dimensions(text, keyword) | |
| for dimension in dimensions: | |
| if dimension not in detected: | |
| detected[dimension] = [] | |
| detected[dimension].append(weight) | |
| return detected | |
| def _detect_explicit_ranking(self, text: str) -> Dict[str, int]: | |
| """檢測明確排序詞""" | |
| detected = {} | |
| # 排序詞到排名的映射 | |
| ranking_map = { | |
| 'first': 1, '1st': 1, 'firstly': 1, 'primary': 1, | |
| 'second': 2, '2nd': 2, 'secondly': 2, 'secondary': 2, | |
| 'third': 3, '3rd': 3, 'thirdly': 3, 'tertiary': 3, | |
| 'fourth': 4, '4th': 4, | |
| 'fifth': 5, '5th': 5 | |
| } | |
| for keyword in self.ranking_keywords['en']: | |
| if keyword in text: | |
| rank = ranking_map.get(keyword, 0) | |
| if rank > 0: | |
| # 找到排序詞附近的維度詞 | |
| dimensions = self._extract_nearby_dimensions(text, keyword) | |
| for dimension in dimensions: | |
| # 如果已經有排名,取較高優先級(較小的數字) | |
| if dimension in detected: | |
| detected[dimension] = min(detected[dimension], rank) | |
| else: | |
| detected[dimension] = rank | |
| return detected | |
| def _detect_negative_constraints(self, text: str) -> List[str]: | |
| """檢測負面約束""" | |
| detected = [] | |
| for lang, keywords in self.negative_keywords.items(): | |
| for keyword in keywords: | |
| if keyword in text: | |
| # 找到負面關鍵字附近的維度詞 | |
| dimensions = self._extract_nearby_dimensions(text, keyword) | |
| detected.extend(dimensions) | |
| return list(set(detected)) | |
| def _extract_nearby_dimensions(self, text: str, keyword: str, window: int = 50) -> List[str]: | |
| """ | |
| 提取關鍵字附近的維度詞 | |
| Args: | |
| text: 文字 | |
| keyword: 關鍵字 | |
| window: 搜尋窗口大小(字元數) | |
| Returns: | |
| List[str]: 檢測到的維度列表 | |
| """ | |
| detected_dimensions = [] | |
| # 找到關鍵字位置 | |
| keyword_positions = [m.start() for m in re.finditer(re.escape(keyword), text)] | |
| for pos in keyword_positions: | |
| # 定義搜尋窗口 | |
| start = max(0, pos - window) | |
| end = min(len(text), pos + len(keyword) + window) | |
| window_text = text[start:end] | |
| # 在窗口中搜尋維度關鍵字 | |
| for dimension, dimension_keywords in self.dimension_keywords.items(): | |
| for dim_keyword in dimension_keywords: | |
| if dim_keyword in window_text: | |
| detected_dimensions.append(dimension) | |
| break # 找到一個就夠了,不重複添加 | |
| return list(set(detected_dimensions)) | |
| def _calculate_final_priorities(self, | |
| detected_emphases: Dict[str, List[float]], | |
| detected_rankings: Dict[str, int], | |
| mentioned_dimensions: Set[str] = None) -> Dict[str, float]: | |
| """ | |
| 計算最終優先級(疊加邏輯) | |
| Args: | |
| detected_emphases: 檢測到的強調 {dimension: [weights]} | |
| detected_rankings: 檢測到的排序 {dimension: rank} | |
| mentioned_dimensions: 被提及但沒有強調詞的維度 | |
| Returns: | |
| Dict[str, float]: 最終優先級分數 | |
| """ | |
| final_priorities = {} | |
| if mentioned_dimensions is None: | |
| mentioned_dimensions = set() | |
| # 合併所有提及的維度(包括強調、排序、和一般提及) | |
| all_dimensions = set(detected_emphases.keys()) | set(detected_rankings.keys()) | mentioned_dimensions | |
| for dimension in all_dimensions: | |
| emphasis_scores = detected_emphases.get(dimension, []) | |
| ranking = detected_rankings.get(dimension, 0) | |
| is_mentioned = dimension in mentioned_dimensions | |
| # 計算疊加分數 | |
| if emphasis_scores or ranking > 0: | |
| # 有強調詞或排序詞 | |
| final_score = self._calculate_stacked_priority(emphasis_scores, ranking) | |
| elif is_mentioned: | |
| # 僅被提及(沒有強調詞),給予基本優先級提升 | |
| final_score = 1.3 # 基本提升,讓系統知道這個維度是使用者關心的 | |
| else: | |
| final_score = 1.0 | |
| final_priorities[dimension] = final_score | |
| return final_priorities | |
| def _calculate_stacked_priority(self, | |
| emphases: List[float], | |
| ranking: int = 0) -> float: | |
| """ | |
| 計算疊加後的優先級分數 | |
| 邏輯: | |
| 1. 取最高強調作為基礎 | |
| 2. 其他強調提供遞減加成 | |
| 3. 排序詞轉換為權重並疊加 | |
| 4. 確保不超過絕對上限 2.5 | |
| Args: | |
| emphases: 強調權重列表 | |
| ranking: 排序位置 (1=first, 2=second, etc.) | |
| Returns: | |
| float: 最終優先級分數 | |
| """ | |
| if not emphases and ranking == 0: | |
| return 1.0 | |
| # 轉換排序為權重 | |
| ranking_weights = { | |
| 1: 2.0, # first | |
| 2: 1.7, # second | |
| 3: 1.4, # third | |
| 4: 1.2, # fourth | |
| 5: 1.1 # fifth | |
| } | |
| ranking_weight = ranking_weights.get(ranking, 0.0) | |
| # 合併所有權重 | |
| all_weights = emphases.copy() | |
| if ranking_weight > 0: | |
| all_weights.append(ranking_weight) | |
| if not all_weights: | |
| return 1.0 | |
| # 排序取最高作為基礎 | |
| sorted_weights = sorted(all_weights, reverse=True) | |
| base_score = sorted_weights[0] | |
| # 額外權重提供遞減加成 (reduced stacking bonus) | |
| bonus = 0.0 | |
| for i, weight in enumerate(sorted_weights[1:], start=1): | |
| # 遞減加成: 第2個給30%, 第3個給15%, 第4個給7.5% (reduced from 50/25/12.5) | |
| bonus += (weight - 1.0) * (0.3 / i) | |
| final_score = min(base_score + bonus, self.absolute_max_priority) | |
| return final_score | |
| def _calculate_detection_confidence(self, | |
| detected_emphases: Dict[str, List[float]], | |
| detected_rankings: Dict[str, int], | |
| text: str) -> float: | |
| """ | |
| 計算檢測信心度 | |
| Args: | |
| detected_emphases: 檢測到的強調 | |
| detected_rankings: 檢測到的排序 | |
| text: 原始文字 | |
| Returns: | |
| float: 信心度 (0-1) | |
| """ | |
| confidence = 0.5 # 基礎信心度 | |
| # 有明確強調 +0.3 | |
| if detected_emphases: | |
| confidence += 0.3 | |
| # 有明確排序 +0.2 | |
| if detected_rankings: | |
| confidence += 0.2 | |
| # 文字長度適中 +0.1 | |
| word_count = len(text.split()) | |
| if 10 <= word_count <= 100: | |
| confidence += 0.1 | |
| return min(1.0, confidence) | |
| def get_detection_summary(self, result: PriorityDetectionResult) -> Dict[str, Any]: | |
| """ | |
| 獲取檢測摘要 | |
| Args: | |
| result: 優先級檢測結果 | |
| Returns: | |
| Dict[str, Any]: 檢測摘要 | |
| """ | |
| return { | |
| 'total_dimensions_detected': len(result.dimension_priorities), | |
| 'high_priority_dimensions': [ | |
| dim for dim, score in result.dimension_priorities.items() if score >= 1.5 | |
| ], | |
| 'dimension_priorities': result.dimension_priorities, | |
| 'emphases_detected': len(result.detected_emphases), | |
| 'rankings_detected': len(result.detected_rankings), | |
| 'negative_constraints': result.detected_negatives, | |
| 'detection_confidence': result.detection_confidence | |
| } | |
| def detect_user_priorities(user_input: str) -> PriorityDetectionResult: | |
| """ | |
| 便利函數: 檢測使用者優先級 | |
| Args: | |
| user_input: 使用者輸入 | |
| Returns: | |
| PriorityDetectionResult: 檢測結果 | |
| """ | |
| detector = PriorityDetector() | |
| return detector.detect_priorities(user_input) | |
| def get_priority_summary(user_input: str) -> Dict[str, Any]: | |
| """ | |
| 便利函數: 獲取優先級摘要 | |
| Args: | |
| user_input: 使用者輸入 | |
| Returns: | |
| Dict[str, Any]: 優先級摘要 | |
| """ | |
| detector = PriorityDetector() | |
| result = detector.detect_priorities(user_input) | |
| return detector.get_detection_summary(result) | |