import re
from typing import Any, Dict, List

# Absolute import from project root
from app.utils.config import get_logger

logger = get_logger(__name__)


class InformationExtractor:
    """Extract meaningful information from user messages.

    Extraction is primarily regex based (see ``_build_patterns``). When spaCy
    and its language models are installed, named entities are extracted as
    well; otherwise the ``entities`` result is simply empty.
    """

    # Keyword patterns used by ``_identify_topics``; one regex per topic.
    _TOPIC_PATTERNS = {
        "order_management": r"\b(order|purchase|buy|bought|cancel|return|refund)\b",
        "product_inquiry": r"\b(product|item|toy|gift|available|stock|price|cost)\b",
        "shipping": r"\b(ship|shipping|delivery|delivered|track|tracking)\b",
        "support": r"\b(help|support|problem|issue|question|assist)\b",
        "account": r"\b(account|profile|login|password|register|sign up)\b",
    }

    def __init__(self):
        self.logger = get_logger(__name__)

        # Try to load spaCy models
        self.nlp_en = None
        self.nlp_zh = None
        self._load_spacy_models()

        # Pattern libraries for extraction
        self.patterns = self._build_patterns()

        self.logger.info("Information Extractor initialized")

    def _load_spacy_models(self):
        """Load the spaCy models that are installed, each at most once.

        The English and Chinese models are loaded independently, so a missing
        English model does not prevent the Chinese one from being used (and
        vice versa). A missing model leaves the matching attribute as ``None``.
        """
        try:
            import spacy
        except ImportError as e:
            self.logger.warning(
                "spaCy models not available, using pattern-based extraction: %s",
                str(e),
            )
            return

        try:
            self.nlp_en = spacy.load("en_core_web_sm")
        except OSError as e:
            self.logger.warning(
                "spaCy models not available, using pattern-based extraction: %s",
                str(e),
            )

        # The Chinese model is optional, so its absence is not worth a warning.
        try:
            self.nlp_zh = spacy.load("zh_core_web_sm")
        except OSError as e:
            self.logger.debug("Optional spaCy model zh_core_web_sm not loaded: %s", e)

        if self.nlp_en is not None or self.nlp_zh is not None:
            self.logger.info("spaCy models loaded successfully")

    def _model_exists(self, model_name: str) -> bool:
        """Check if a spaCy model can be loaded.

        Note that the check works by actually loading the model, so it is as
        expensive as ``spacy.load``. Returns ``False`` when spaCy itself is
        not installed.
        """
        try:
            import spacy

            spacy.load(model_name)
        except (ImportError, OSError):
            return False
        return True

    def _build_patterns(self) -> Dict[str, Any]:
        """Build regex patterns for information extraction.

        Returns:
            A mapping from pattern-group name to a list of regex strings.
            ``product_mentions`` is the exception: it maps each product
            category to its own list of regex strings.
        """
        return {
            # Name patterns
            "name_introductions": [
                r"(?:my name is|i'm|i am|call me)\s+([a-zA-Z]+)",
                r"(?:this is|here is)\s+([a-zA-Z]+)(?:\s+speaking)?",
                r"([a-zA-Z]+)\s+(?:here|speaking)",
            ],
            # Product interests
            "product_mentions": {
                "toys": [
                    r"\b(toy|toys|doll|dolls|action figure|game|games|puzzle|puzzles)\b"
                ],
                "gifts": [r"\b(gift|gifts|present|presents)\b"],
                "books": [r"\b(book|books|reading|educational|learning)\b"],
                "electronics": [
                    r"\b(electronic|electronics|gadget|gadgets|tablet|phone)\b"
                ],
            },
            # Order references
            "order_patterns": [
                # "order #123", "order number 123", "order no 123", "order no. 123"
                r"order\s+(?:#|number|no\.?)\s*(\w+)",
                r"order\s+(\w{4,})",
                r"my order\s+(\w+)",
            ],
            # Contact information
            "email_patterns": [r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"],
            "phone_patterns": [
                r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b",
                r"\(\d{3}\)\s*\d{3}[-.\s]?\d{4}",
            ],
            # Emotional Indicators
            "positive_emotions": [
                r"\b(happy|excited|love|great|awesome|amazing|perfect|wonderful)\b",
                r"\b(thank you|thanks|appreciate)\b",
            ],
            "negative_emotions": [
                r"\b(frustrated|angry|disappointed|upset|annoyed|terrible|awful)\b",
                r"\b(problem|issue|complain|complaint|wrong|error)\b",
            ],
            # Urgency indicators
            "urgency_patterns": [
                r"\b(urgent|asap|immediately|quickly|rush|emergency)\b",
                r"\b(need.{0,10}(now|today|right away))\b",
            ],
        }

    def extract_user_information(
        self, message: str, language: str = "en"
    ) -> Dict[str, Any]:
        """Extract comprehensive user information from a message.

        Args:
            message: Raw user message. ``None`` is treated as an empty string.
            language: Language code; ``"zh"`` selects the Chinese spaCy model
                when it is loaded, anything else uses the English model.

        Returns:
            A dict with the keys ``personal_info``, ``interests``,
            ``contact_info``, ``emotional_state``, ``entities``, ``topics``
            and ``urgency_level``.
        """
        message = message or ""
        # Regex extraction runs on the lower-cased text; spaCy gets the
        # original text because casing matters for entity recognition.
        message_lower = message.lower()

        extracted_info = {
            "personal_info": self._extract_personal_info(message_lower),
            "interests": self._extract_interests(message_lower),
            "contact_info": self._extract_contact_info(message_lower),
            "emotional_state": self._analyze_emotional_state(message_lower),
            "entities": self._extract_entities_spacy(message, language),
            "topics": self._identify_topics(message_lower),
            "urgency_level": self._assess_urgency(message_lower),
        }

        # Log only when something was actually found. ``emotional_state`` and
        # ``urgency_level`` always carry default values, so they need an
        # explicit check rather than a plain truthiness test.
        found_something = (
            extracted_info["personal_info"]
            or extracted_info["interests"]
            or extracted_info["contact_info"]
            or extracted_info["entities"]
            or extracted_info["topics"]
            or extracted_info["emotional_state"]["emotions"]
            or extracted_info["urgency_level"] != "normal"
        )
        if found_something:
            self.logger.debug("Extracted information: %s", extracted_info)

        return extracted_info

    def _extract_personal_info(self, message: str) -> Dict[str, str]:
        """Extract personal information like names.

        Returns:
            ``{"name": <Title-cased name>}`` for the first introduction
            pattern that yields a valid name, otherwise an empty dict.
        """
        personal_info = {}

        for pattern in self.patterns["name_introductions"]:
            matches = re.findall(pattern, message, re.IGNORECASE)
            if not matches:
                continue
            # Take the first match and capitalize properly
            name = matches[0].strip().title()
            if len(name) > 1 and name.isalpha():  # Basic validation
                personal_info["name"] = name
                break

        return personal_info

    def _extract_interests(self, message: str) -> List[str]:
        """Extract product interests and preferences.

        Returns:
            The product categories mentioned in ``message``, without
            duplicates, in the order the categories are defined in
            ``self.patterns["product_mentions"]``.
        """
        return [
            category
            for category, patterns in self.patterns["product_mentions"].items()
            if any(re.search(p, message, re.IGNORECASE) for p in patterns)
        ]

    def _extract_contact_info(self, message: str) -> Dict[str, str]:
        """Extract contact information.

        Returns:
            A dict that may contain ``email`` and/or ``phone``, each holding
            the first match found in ``message``.
        """
        contact_info = {}

        for field, pattern_key in (
            ("email", "email_patterns"),
            ("phone", "phone_patterns"),
        ):
            for pattern in self.patterns[pattern_key]:
                match = re.search(pattern, message)
                if match:
                    contact_info[field] = match.group(0)
                    break

        return contact_info

    def _find_emotion_words(self, pattern_key: str, message: str) -> List[str]:
        """Return every lower-cased match of the given emotion pattern group."""
        words = []
        for pattern in self.patterns[pattern_key]:
            words.extend(
                match.group(0).lower()
                for match in re.finditer(pattern, message, re.IGNORECASE)
            )
        return words

    def _analyze_emotional_state(self, message: str) -> Dict[str, Any]:
        """Analyze user's emotional state.

        Returns:
            A dict with ``sentiment`` (``"positive"``, ``"negative"`` or
            ``"neutral"``), ``emotions`` (the matched emotion words) and
            ``confidence`` (0.5 for neutral, otherwise 0.5 plus 0.1 per
            matching word of the winning sentiment, capped at 0.9).
        """
        positive_words = self._find_emotion_words("positive_emotions", message)
        negative_words = self._find_emotion_words("negative_emotions", message)
        positive_count = len(positive_words)
        negative_count = len(negative_words)

        emotional_state = {
            "sentiment": "neutral",
            "emotions": positive_words + negative_words,
            "confidence": 0.5,
        }

        # A tie (including "no matches at all") stays neutral.
        if positive_count > negative_count:
            emotional_state["sentiment"] = "positive"
            emotional_state["confidence"] = min(0.9, 0.5 + (positive_count * 0.1))
        elif negative_count > positive_count:
            emotional_state["sentiment"] = "negative"
            emotional_state["confidence"] = min(0.9, 0.5 + (negative_count * 0.1))

        return emotional_state

    def _extract_entities_spacy(
        self, message: str, language: str
    ) -> Dict[str, List[str]]:
        """Extract entities using spaCy (if available).

        Returns:
            A mapping from lower-cased entity label to the distinct entity
            texts found, or an empty dict when no spaCy model is loaded or
            the extraction fails.
        """
        entities = {}

        # Prefer the Chinese model for "zh"; everything else (and "zh" when
        # the Chinese model is missing) falls back to the English model.
        if language == "zh" and self.nlp_zh is not None:
            nlp = self.nlp_zh
        elif self.nlp_en is not None:
            nlp = self.nlp_en
        else:
            return entities  # Return empty if no spaCy models

        try:
            doc = nlp(message)
            for ent in doc.ents:
                texts = entities.setdefault(ent.label_.lower(), [])
                entity_text = ent.text.strip()
                if entity_text not in texts:
                    texts.append(entity_text)
        except Exception as e:
            # Entity extraction is best-effort: a spaCy failure must not
            # break the regex-based extraction results.
            self.logger.warning("spaCy entity extraction failed: %s", str(e))

        return entities

    def _identify_topics(self, message: str) -> List[str]:
        """Identify conversation topics mentioned in the message."""
        return [
            topic
            for topic, pattern in self._TOPIC_PATTERNS.items()
            if re.search(pattern, message, re.IGNORECASE)
        ]

    def _assess_urgency(self, message: str) -> str:
        """Assess the urgency level of the message.

        Returns:
            ``"high"`` for two or more urgency matches, ``"medium"`` for
            exactly one, ``"normal"`` otherwise.
        """
        urgency_score = sum(
            1
            for pattern in self.patterns["urgency_patterns"]
            for _ in re.finditer(pattern, message, re.IGNORECASE)
        )

        if urgency_score >= 2:
            return "high"
        if urgency_score >= 1:
            return "medium"
        return "normal"

    def extract_conversation_summary(self, messages: List[Dict]) -> Dict[str, Any]:
        """Extract summary information from a conversation.

        Args:
            messages: Conversation turns; the user's text is read from each
                item's ``"user_input"`` key. Items without it are skipped.

        Returns:
            An empty dict for an empty conversation, otherwise a
            JSON-serializable summary dict. ``topics_discussed`` and
            ``user_interests`` are sorted lists so the output is
            deterministic.
        """
        if not messages:
            return {}

        topics_discussed = set()
        user_interests = set()
        emotional_journey = []
        key_information = {}

        for msg in messages:
            user_input = msg.get("user_input", "")
            if not user_input:
                continue

            # Extract information from each message
            info = self.extract_user_information(user_input)

            topics_discussed.update(info.get("topics", []))
            user_interests.update(info.get("interests", []))

            # Track emotional journey
            emotional_journey.append(
                info.get("emotional_state", {}).get("sentiment", "neutral")
            )

            # Store key personal information (later messages win on conflict)
            key_information.update(info.get("personal_info", {}))

        return {
            "total_messages": len(messages),
            # Sets are converted to lists for JSON serialization
            "topics_discussed": sorted(topics_discussed),
            "user_interests": sorted(user_interests),
            "emotional_journey": emotional_journey,
            "key_information": key_information,
            "resolution_status": "unknown",
        }