"""
Hybrid Keyword Extraction Utilities

Provides reusable keyword extraction combining:
1. Deterministic extraction (regex + stopword filtering)
2. Semantic hints via LLM (business concepts)
3. Optional LLM-provided concepts

This module can be imported by any agent needing intelligent schema discovery.
"""

import re
import logging
from typing import Dict, List, Optional, Callable, Any

logger = logging.getLogger(__name__)

# Question words, auxiliaries, prepositions and quantifiers that carry no
# schema-discovery signal. Built once at import time instead of on every call.
_STOPWORDS = frozenset({
    'what', 'when', 'where', 'who', 'which', 'how', 'show', 'give', 'tell',
    'get', 'find', 'list', 'display', 'the', 'a', 'an', 'is', 'are', 'was',
    'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'can',
    'could', 'would', 'should', 'may', 'might', 'must', 'will', 'shall',
    'from', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'between',
    'under', 'of', 'me', 'my', 'our', 'total', 'all', 'any', 'each', 'every',
})

# Identifier-like words: start with a letter or underscore.
_WORD_RE = re.compile(r'\b[a-z_][a-z0-9_]*\b')
# A phrase made only of identifier characters and spaces.
_BIGRAM_RE = re.compile(r'^[a-z_][a-z0-9_ ]+$')
# Punctuation that may legitimately end a phrase ("monthly revenue?").
_TRAILING_PUNCTUATION = '?!.,;:'
_MIN_KEYWORD_LENGTH = 3
_MAX_BIGRAMS = 3
_MAX_SEMANTIC_HINTS = 5


def _clean_strings(values: Any) -> List[str]:
    """Return the non-blank strings found in *values*, whitespace-stripped.

    ``None``/empty input gives ``[]``; a bare string is treated as a single
    entry rather than iterated character by character; non-string entries
    are dropped.
    """
    if not values:
        return []
    if isinstance(values, str):
        values = [values]
    return [v.strip() for v in values if isinstance(v, str) and v.strip()]


def _extract_base_keywords(normalized: str) -> List[str]:
    """Return non-stopword words of at least three characters, in order.

    Underscored identifiers are split into their parts
    (e.g. ``user_name`` -> ``user``, ``name``). Duplicates are kept.
    """
    keywords: List[str] = []
    for word in _WORD_RE.findall(normalized):
        # split('_') yields [word] unchanged when there is no underscore.
        keywords.extend(
            part for part in word.split('_')
            if part not in _STOPWORDS and len(part) >= _MIN_KEYWORD_LENGTH
        )
    return keywords


def _extract_bigrams(normalized: str) -> List[str]:
    """Return adjacent two-word phrases in which neither word is a stopword.

    Trailing punctuation is removed from the second word only, so a phrase
    may end at punctuation ("monthly revenue?") but never spans it
    ("sales, customers" is rejected by the phrase pattern).
    """
    tokens = normalized.split()
    bigrams: List[str] = []
    for first, second in zip(tokens, tokens[1:]):
        second = second.rstrip(_TRAILING_PUNCTUATION)
        if not second or first in _STOPWORDS or second in _STOPWORDS:
            continue
        phrase = f"{first} {second}"
        if _BIGRAM_RE.match(phrase):
            bigrams.append(phrase)
    return bigrams


def extract_hybrid_keywords(
    question: str,
    llm_concepts: Optional[List[str]] = None,
    semantic_client: Optional[Callable[[str], List[str]]] = None,
) -> Dict[str, List[str]]:
    """
    Extract keywords using hybrid approach: deterministic + semantic + LLM concepts.

    Args:
        question: The user's natural language question. ``None`` or any other
            non-string value is treated as an empty question.
        llm_concepts: Optional list of high-level concepts provided by the LLM
            agent. Blank and non-string entries are ignored.
        semantic_client: Optional callable that takes a question and returns
            semantic hints. If None, semantic extraction is skipped. If it
            raises or returns something unusable, a warning is logged and
            no hints are used.

    Returns:
        Dictionary with:
        - 'base': Deterministic keywords from stopword filtering, followed by
          up to three bigrams
        - 'semantic': Stripped, non-blank hints from semantic_client
          (empty if no client was provided or it failed)
        - 'concepts': Lowercased, deduplicated llm_concepts
        - 'combined': Merged list of all keywords, deduplicated
          case-insensitively, ordered concepts -> base -> semantic
    """
    question_text = question if isinstance(question, str) else ""
    normalized = question_text.lower().strip()

    # STEP 1: Deterministic keyword extraction (order-preserving dedupe).
    all_base_keywords = list(dict.fromkeys(_extract_base_keywords(normalized)))
    for bigram in _extract_bigrams(normalized)[:_MAX_BIGRAMS]:
        if bigram not in all_base_keywords:
            all_base_keywords.append(bigram)

    # STEP 2: Semantic hints from LLM (if semantic_client provided).
    semantic_hints: List[str] = []
    if semantic_client:
        try:
            # Cleaning happens inside the try so a client that returns None
            # or a non-iterable degrades to "no hints" instead of crashing
            # later in the combine step.
            semantic_hints = _clean_strings(semantic_client(question))
            logger.info("Semantic hints extracted: %s", semantic_hints)
        except Exception as e:
            # Best-effort by design: a failing client must not break
            # deterministic extraction.
            logger.warning("Could not get semantic hints: %s", e)
            semantic_hints = []

    # STEP 3: Process LLM-provided concepts (if any).
    processed_concepts = list(
        dict.fromkeys(c.lower() for c in _clean_strings(llm_concepts))
    )

    # STEP 4: Combine all keywords (preserve order, remove duplicates).
    # Order: concepts (LLM priority) -> base (deterministic) -> semantic (hints).
    combined: List[str] = []
    seen = set()
    for keyword_list in (processed_concepts, all_base_keywords, semantic_hints):
        for kw in keyword_list:
            key = kw.lower()
            if key not in seen:
                seen.add(key)
                combined.append(kw)

    result = {
        'base': all_base_keywords,
        'semantic': semantic_hints,
        'concepts': processed_concepts,
        'combined': combined,
    }

    logger.debug("Hybrid keyword extraction for '%s':", question)
    logger.debug(" Base: %s", all_base_keywords)
    logger.debug(" Semantic: %s", semantic_hints)
    logger.debug(" Concepts: %s", processed_concepts)
    logger.debug(" Combined: %s", combined)

    return result


def create_gemini_semantic_client(model_id: str = "gemini-2.5-flash") -> Callable[[str], List[str]]:
    """
    Creates a semantic client that uses Gemini to extract business concepts.

    Args:
        model_id: The Gemini model ID to use

    Returns:
        Callable that takes a question and returns list of semantic hints
        (at most five). If the agno package cannot be imported, an error is
        logged and the returned callable always yields an empty list.
    """
    try:
        from agno.models.google import Gemini
    except ImportError:
        logger.error("Cannot import Gemini. Install agno package.")
        return lambda q: []

    def semantic_client(question: str) -> List[str]:
        """Extract semantic hints using Gemini; returns [] on any failure."""
        semantic_hint_prompt = (
            "\n"
            "Given this user question, what are the likely business concepts or entity types they're asking about?\n"
            "Be concise and provide semantic categories (not exact table names).\n"
            "\n"
            f'Question: "{question}"\n'
            "\n"
            "Think about:\n"
            '1. What business metric/concept are they asking about? (e.g., "earnings" = revenue/profit/income)\n'
            '2. What entities/dimensions are involved? (e.g., "2023" = time period, "customers" = people)\n'
            '3. What operations? (e.g., "compare" = aggregation/grouping, "trends" = time series)\n'
            "\n"
            'Respond with just 3-5 short semantic hints separated by commas (e.g., "revenue concept, time period, user dimension"):\n'
        )

        try:
            model = Gemini(id=model_id)
            response = model.generate(semantic_hint_prompt)
            content = getattr(response, 'content', None)
            if not isinstance(content, str):
                # Missing or non-text content: nothing to parse.
                return []

            # Parse comma-separated hints, dropping blanks.
            hints = [h.strip() for h in content.split(',') if h.strip()]
            return hints[:_MAX_SEMANTIC_HINTS]
        except Exception as e:
            # Best-effort: any model/transport failure means "no hints".
            logger.warning("Gemini semantic extraction failed: %s", e)
            return []

    return semantic_client