Spaces:
Running
Running
| """ | |
| Hybrid Keyword Extraction Utilities | |
| Provides reusable keyword extraction combining: | |
| 1. Deterministic extraction (regex + stopword filtering) | |
| 2. Semantic hints via LLM (business concepts) | |
| 3. Optional LLM-provided concepts | |
| This module can be imported by any agent needing intelligent schema discovery. | |
| """ | |
| import re | |
| import logging | |
| from typing import Dict, List, Optional, Callable, Any | |
| logger = logging.getLogger(__name__) | |
def extract_hybrid_keywords(
    question: str,
    llm_concepts: Optional[List[str]] = None,
    semantic_client: Optional[Callable[[str], List[str]]] = None,
) -> Dict[str, List[str]]:
    """
    Extract keywords using hybrid approach: deterministic + semantic + LLM concepts.

    Args:
        question: The user's natural language question. ``None`` is treated
            as an empty question.
        llm_concepts: Optional list of high-level concepts provided by the LLM agent.
            A bare string is treated as a single concept; non-string entries
            are ignored.
        semantic_client: Optional callable that takes a question and returns semantic hints.
            If None, semantic extraction is skipped. The call is best-effort:
            an exception, a ``None`` result or a non-iterable result yields no
            hints; a bare string is treated as a single hint; non-string
            entries are dropped.

    Returns:
        Dictionary with:
        - 'base': Deterministic keywords from stopword filtering, followed by
          up to three bigram phrases
        - 'semantic': LLM-generated semantic hints (if semantic_client provided)
        - 'concepts': Lower-cased, de-duplicated LLM-provided concepts
          (if llm_concepts provided)
        - 'combined': Merged list of all keywords, de-duplicated
          case-insensitively, ordered concepts -> base -> semantic
    """
    # Normalize the question (tolerate None so callers get empty lists, not a crash)
    normalized = (question or "").lower().strip()

    # Common stopwords to filter out
    stopwords = {
        'what', 'when', 'where', 'who', 'which', 'how', 'show', 'give', 'tell',
        'get', 'find', 'list', 'display', 'the', 'a', 'an', 'is', 'are', 'was',
        'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'can',
        'could', 'would', 'should', 'may', 'might', 'must', 'will', 'shall',
        'from', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into',
        'through', 'during', 'before', 'after', 'above', 'below', 'between',
        'under', 'of', 'me', 'my', 'our', 'total', 'all', 'any', 'each', 'every'
    }

    # STEP 1: Deterministic keyword extraction
    # Extract words, including splitting underscored identifiers
    words = re.findall(r'\b[a-z_][a-z0-9_]*\b', normalized)
    base_keywords = []
    for word in words:
        if '_' in word:
            # Split underscored identifiers (e.g., user_name -> user, name)
            parts = word.split('_')
            base_keywords.extend([p for p in parts if p not in stopwords and len(p) > 2])
        elif word not in stopwords and len(word) > 2:
            base_keywords.append(word)

    # Extract multi-word phrases (simple bigrams).
    # Whitespace tokens can carry edge punctuation ("segment?", '"net'), which
    # previously made the phrase check reject the pair -- so the last phrase of
    # nearly every question was lost. Each token is split into
    # (leading punctuation, core, trailing punctuation); punctuation on the
    # outer edges of a pair is ignored, while punctuation *between* the two
    # words (e.g. "sales, revenue") means they do not form a phrase.
    edge_pattern = re.compile(r'^(\W*)(.*?)(\W*)$')
    pieces = [edge_pattern.match(token).groups() for token in normalized.split()]
    bigrams = []
    for (_, first, first_trail), (second_lead, second, _) in zip(pieces, pieces[1:]):
        if first_trail or second_lead:
            continue
        if not first or not second:
            continue
        if first in stopwords or second in stopwords:
            continue
        phrase = f"{first} {second}"
        if re.match(r'^[a-z_][a-z0-9_ ]+$', phrase):
            bigrams.append(phrase)

    # Combine unique base keywords and relevant bigrams (limit bigrams to top 3).
    # A dict keeps first-seen order while giving O(1) duplicate checks.
    ordered_base = dict.fromkeys(base_keywords)
    for bigram in bigrams[:3]:
        ordered_base.setdefault(bigram, None)
    all_base_keywords = list(ordered_base)

    # STEP 2: Semantic hints from LLM (if semantic_client provided)
    semantic_hints: List[str] = []
    if semantic_client:
        try:
            raw_hints: Any = semantic_client(question)
            if raw_hints is None:
                raw_hints = []
            elif isinstance(raw_hints, str):
                # A bare string would otherwise be iterated character by character.
                raw_hints = [raw_hints]
            # Validate inside the try so a malformed result degrades to "no
            # hints" instead of crashing the merge step below.
            semantic_hints = [hint for hint in raw_hints if isinstance(hint, str)]
            logger.info(f"Semantic hints extracted: {semantic_hints}")
        except Exception as e:
            logger.warning(f"Could not get semantic hints: {e}")
            semantic_hints = []

    # STEP 3: Process LLM-provided concepts (if any)
    processed_concepts: List[str] = []
    if llm_concepts:
        if isinstance(llm_concepts, str):
            llm_concepts = [llm_concepts]
        normalized_concepts = (
            c.lower().strip()
            for c in llm_concepts
            if isinstance(c, str)
        )
        # Drop blanks, then dedupe while preserving order
        processed_concepts = list(dict.fromkeys(c for c in normalized_concepts if c))

    # STEP 4: Combine all keywords (preserve order, remove duplicates)
    combined = []
    seen = set()
    # Add in order: concepts (LLM priority) -> base (deterministic) -> semantic (hints)
    for keyword_list in [processed_concepts, all_base_keywords, semantic_hints]:
        for kw in keyword_list:
            kw_lower = kw.lower().strip()
            if kw_lower and kw_lower not in seen:
                # Keep the keyword as supplied; only the duplicate check is normalized.
                combined.append(kw)
                seen.add(kw_lower)

    result = {
        'base': all_base_keywords,
        'semantic': semantic_hints,
        'concepts': processed_concepts,
        'combined': combined
    }

    logger.debug(f"Hybrid keyword extraction for '{question}':")
    logger.debug(f" Base: {all_base_keywords}")
    logger.debug(f" Semantic: {semantic_hints}")
    logger.debug(f" Concepts: {processed_concepts}")
    logger.debug(f" Combined: {combined}")
    return result
def create_gemini_semantic_client(model_id: str = "gemini-2.5-flash") -> Callable[[str], List[str]]:
    """
    Build a semantic-hint callable backed by a Gemini model.

    Args:
        model_id: The Gemini model ID passed to the model constructor

    Returns:
        Callable that takes a question and returns a list of at most five
        semantic hints. If the Gemini class cannot be imported, the returned
        callable always yields an empty list.
    """
    try:
        from agno.models.google import Gemini
    except ImportError:
        logger.error("Cannot import Gemini. Install agno package.")

        def no_hints(q):
            # Fallback used when the Gemini dependency is unavailable.
            return []

        return no_hints

    def semantic_client(question: str) -> List[str]:
        """Ask Gemini for comma-separated semantic hints about the question."""
        semantic_hint_prompt = f"""
Given this user question, what are the likely business concepts or entity types they're asking about?
Be concise and provide semantic categories (not exact table names).
Question: "{question}"
Think about:
1. What business metric/concept are they asking about? (e.g., "earnings" = revenue/profit/income)
2. What entities/dimensions are involved? (e.g., "2023" = time period, "customers" = people)
3. What operations? (e.g., "compare" = aggregation/grouping, "trends" = time series)
Respond with just 3-5 short semantic hints separated by commas (e.g., "revenue concept, time period, user dimension"):
"""
        try:
            reply = Gemini(id=model_id).generate(semantic_hint_prompt)
            # A reply without a `content` attribute is treated as empty text.
            raw_text = getattr(reply, 'content', "")
            # Parse comma-separated hints, skipping blank pieces
            parsed = []
            for piece in raw_text.split(','):
                cleaned = piece.strip()
                if cleaned:
                    parsed.append(cleaned)
            # Limit to 5 hints max
            return parsed[:5]
        except Exception as e:
            # Best-effort: any failure is logged and reported as "no hints".
            logger.warning(f"Gemini semantic extraction failed: {e}")
            return []

    return semantic_client