Spaces:
Running
Running
| """ | |
| Intent detection module for RAG pipeline. | |
| Detects user intent from queries to enable intent-based gating. | |
| """ | |
| import re | |
| from typing import List, Dict, Set | |
| import logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Intent keywords mapping | |
| INTENT_KEYWORDS: Dict[str, List[str]] = { | |
| "integration": [ | |
| "integrate", "integration", "api", "connect", "connection", "webhook", | |
| "shopify", "woocommerce", "stripe", "paypal", "payment gateway", | |
| "whatsapp", "telegram", "slack", "zapier", "ifttt", "automation" | |
| ], | |
| "billing": [ | |
| "billing", "invoice", "payment", "subscription", "plan", "pricing", | |
| "cost", "price", "charge", "fee", "refund", "cancel", "renew" | |
| ], | |
| "account": [ | |
| "account", "profile", "settings", "preferences", "user", "login", | |
| "signup", "register", "authentication", "auth" | |
| ], | |
| "password_reset": [ | |
| "password", "reset", "forgot", "change password", "update password", | |
| "password reset link", "expire", "expiry" | |
| ], | |
| "pricing": [ | |
| "pricing", "price", "plan", "cost", "subscription", "tier", "starter", | |
| "pro", "enterprise", "monthly", "yearly", "billing" | |
| ], | |
| "general": [] # Catch-all for general queries | |
| } | |
| def detect_intents(query: str) -> List[str]: | |
| """ | |
| Detect intents from a user query. | |
| Args: | |
| query: User's question | |
| Returns: | |
| List of detected intent labels (e.g., ["integration", "billing"]) | |
| """ | |
| query_lower = query.lower() | |
| detected = [] | |
| for intent, keywords in INTENT_KEYWORDS.items(): | |
| if intent == "general": | |
| continue # Skip general, it's a catch-all | |
| # Check if any keyword matches | |
| for keyword in keywords: | |
| # Use word boundary matching for better accuracy | |
| pattern = r'\b' + re.escape(keyword.lower()) + r'\b' | |
| if re.search(pattern, query_lower): | |
| detected.append(intent) | |
| break # Only add intent once | |
| # If no specific intent detected, return general | |
| if not detected: | |
| detected = ["general"] | |
| logger.info(f"Detected intents for query '{query[:50]}...': {detected}") | |
| return detected | |
| def get_intent_keywords(intents: List[str]) -> Set[str]: | |
| """ | |
| Get all keywords for a list of intents. | |
| Args: | |
| intents: List of intent labels | |
| Returns: | |
| Set of keywords for those intents | |
| """ | |
| keywords = set() | |
| for intent in intents: | |
| if intent in INTENT_KEYWORDS: | |
| keywords.update(INTENT_KEYWORDS[intent]) | |
| return keywords | |
| def check_direct_match( | |
| query: str, | |
| retrieved_chunks: List[str], | |
| intent_keywords: Set[str] = None | |
| ) -> bool: | |
| """ | |
| Check if at least one retrieved chunk contains direct matches for query intent. | |
| Args: | |
| query: User's question | |
| retrieved_chunks: List of retrieved chunk texts | |
| intent_keywords: Optional set of intent keywords to check | |
| Returns: | |
| True if at least one chunk has direct match, False otherwise | |
| """ | |
| if not retrieved_chunks: | |
| return False | |
| query_lower = query.lower() | |
| query_words = set(re.findall(r'\b\w+\b', query_lower)) | |
| # Get intent keywords if not provided | |
| if intent_keywords is None: | |
| intents = detect_intents(query) | |
| intent_keywords = get_intent_keywords(intents) | |
| # Check each chunk for direct matches | |
| for chunk in retrieved_chunks: | |
| chunk_lower = chunk.lower() | |
| # Check 1: Intent keywords must be present in chunk | |
| if intent_keywords: | |
| intent_found = any( | |
| re.search(r'\b' + re.escape(kw.lower()) + r'\b', chunk_lower) | |
| for kw in intent_keywords | |
| ) | |
| if not intent_found: | |
| continue # Skip this chunk if no intent keywords | |
| # Check 2: At least 2-3 important query words should be in chunk | |
| # (excluding common stop words) | |
| stop_words = {"the", "a", "an", "is", "are", "was", "were", "be", "been", | |
| "to", "of", "and", "or", "but", "in", "on", "at", "for", | |
| "with", "how", "what", "when", "where", "why", "do", "does"} | |
| important_words = query_words - stop_words | |
| if len(important_words) >= 2: | |
| # Need at least 2 important words to match | |
| matches = sum(1 for word in important_words if word in chunk_lower) | |
| if matches >= 2: | |
| logger.info(f"Direct match found: {matches} important words matched in chunk") | |
| return True | |
| elif len(important_words) == 1: | |
| # Single important word - require exact phrase match | |
| for word in important_words: | |
| if re.search(r'\b' + re.escape(word) + r'\b', chunk_lower): | |
| logger.info(f"Direct match found: single important word '{word}' matched") | |
| return True | |
| logger.warning("No direct match found in retrieved chunks") | |
| return False | |