sirus / backend /SQL_Agent /hybrid_keyword_utils.py
ranilmukesh's picture
Deploy SiRUS SQL Agent backend
b8277c4
"""
Hybrid Keyword Extraction Utilities
Provides reusable keyword extraction combining:
1. Deterministic extraction (regex + stopword filtering)
2. Semantic hints via LLM (business concepts)
3. Optional LLM-provided concepts
This module can be imported by any agent needing intelligent schema discovery.
"""
import re
import logging
from typing import Dict, List, Optional, Callable, Any
logger = logging.getLogger(__name__)
def extract_hybrid_keywords(
question: str,
llm_concepts: Optional[List[str]] = None,
semantic_client: Optional[Callable[[str], List[str]]] = None,
) -> Dict[str, List[str]]:
"""
Extract keywords using hybrid approach: deterministic + semantic + LLM concepts.
Args:
question: The user's natural language question
llm_concepts: Optional list of high-level concepts provided by the LLM agent
semantic_client: Optional callable that takes a question and returns semantic hints
If None, semantic extraction is skipped
Returns:
Dictionary with:
- 'base': Deterministic keywords from stopword filtering
- 'semantic': LLM-generated semantic hints (if semantic_client provided)
- 'concepts': LLM-provided concepts (if llm_concepts provided)
- 'combined': Merged, deduplicated list of all keywords
"""
# Normalize the question
normalized = question.lower().strip()
# Common stopwords to filter out
stopwords = {
'what', 'when', 'where', 'who', 'which', 'how', 'show', 'give', 'tell',
'get', 'find', 'list', 'display', 'the', 'a', 'an', 'is', 'are', 'was',
'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'can',
'could', 'would', 'should', 'may', 'might', 'must', 'will', 'shall',
'from', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into',
'through', 'during', 'before', 'after', 'above', 'below', 'between',
'under', 'of', 'me', 'my', 'our', 'total', 'all', 'any', 'each', 'every'
}
# STEP 1: Deterministic keyword extraction
# Extract words, including splitting underscored identifiers
words = re.findall(r'\b[a-z_][a-z0-9_]*\b', normalized)
base_keywords = []
for word in words:
if '_' in word:
# Split underscored identifiers (e.g., user_name -> user, name)
parts = word.split('_')
base_keywords.extend([p for p in parts if p not in stopwords and len(p) > 2])
elif word not in stopwords and len(word) > 2:
base_keywords.append(word)
# Extract multi-word phrases (simple bigrams)
tokens = normalized.split()
bigrams = []
for i in range(len(tokens) - 1):
if tokens[i] not in stopwords and tokens[i+1] not in stopwords:
phrase = f"{tokens[i]} {tokens[i+1]}"
if re.match(r'^[a-z_][a-z0-9_ ]+$', phrase):
bigrams.append(phrase)
# Combine unique base keywords and relevant bigrams (limit bigrams to top 3)
all_base_keywords = list(dict.fromkeys(base_keywords))
for bigram in bigrams[:3]:
if bigram not in all_base_keywords:
all_base_keywords.append(bigram)
# STEP 2: Semantic hints from LLM (if semantic_client provided)
semantic_hints = []
if semantic_client:
try:
semantic_hints = semantic_client(question)
logger.info(f"Semantic hints extracted: {semantic_hints}")
except Exception as e:
logger.warning(f"Could not get semantic hints: {e}")
semantic_hints = []
# STEP 3: Process LLM-provided concepts (if any)
processed_concepts = []
if llm_concepts:
processed_concepts = [
c.lower().strip()
for c in llm_concepts
if c and c.strip()
]
processed_concepts = list(dict.fromkeys(processed_concepts)) # Dedupe
# STEP 4: Combine all keywords (preserve order, remove duplicates)
combined = []
seen = set()
# Add in order: concepts (LLM priority) -> base (deterministic) -> semantic (hints)
for keyword_list in [processed_concepts, all_base_keywords, semantic_hints]:
for kw in keyword_list:
kw_lower = kw.lower().strip()
if kw_lower and kw_lower not in seen:
combined.append(kw)
seen.add(kw_lower)
result = {
'base': all_base_keywords,
'semantic': semantic_hints,
'concepts': processed_concepts,
'combined': combined
}
logger.debug(f"Hybrid keyword extraction for '{question}':")
logger.debug(f" Base: {all_base_keywords}")
logger.debug(f" Semantic: {semantic_hints}")
logger.debug(f" Concepts: {processed_concepts}")
logger.debug(f" Combined: {combined}")
return result
def create_gemini_semantic_client(model_id: str = "gemini-2.5-flash") -> Callable[[str], List[str]]:
"""
Creates a semantic client that uses Gemini to extract business concepts.
Args:
model_id: The Gemini model ID to use
Returns:
Callable that takes a question and returns list of semantic hints
"""
try:
from agno.models.google import Gemini
except ImportError:
logger.error("Cannot import Gemini. Install agno package.")
return lambda q: []
def semantic_client(question: str) -> List[str]:
"""Extract semantic hints using Gemini"""
semantic_hint_prompt = f"""
Given this user question, what are the likely business concepts or entity types they're asking about?
Be concise and provide semantic categories (not exact table names).
Question: "{question}"
Think about:
1. What business metric/concept are they asking about? (e.g., "earnings" = revenue/profit/income)
2. What entities/dimensions are involved? (e.g., "2023" = time period, "customers" = people)
3. What operations? (e.g., "compare" = aggregation/grouping, "trends" = time series)
Respond with just 3-5 short semantic hints separated by commas (e.g., "revenue concept, time period, user dimension"):
"""
try:
model = Gemini(id=model_id)
response = model.generate(semantic_hint_prompt)
semantic_hints_str = response.content if hasattr(response, 'content') else ""
# Parse comma-separated hints
hints = [h.strip() for h in semantic_hints_str.split(',') if h.strip()]
return hints[:5] # Limit to 5 hints max
except Exception as e:
logger.warning(f"Gemini semantic extraction failed: {e}")
return []
return semantic_client