Spaces:

ranilmukesh
/

sirus

Running

App Files Files Community

sirus / backend /SQL_Agent /hybrid_keyword_utils.py

ranilmukesh

Deploy SiRUS SQL Agent backend

b8277c4 3 months ago

raw

history blame contribute delete

6.71 kB

	"""
	Hybrid Keyword Extraction Utilities

	Provides reusable keyword extraction combining:
	1. Deterministic extraction (regex + stopword filtering)
	2. Semantic hints via LLM (business concepts)
	3. Optional LLM-provided concepts

	This module can be imported by any agent needing intelligent schema discovery.
	"""

	import re
	import logging
	from typing import Dict, List, Optional, Callable, Any

	logger = logging.getLogger(__name__)


	def extract_hybrid_keywords(
	question: str,
	llm_concepts: Optional[List[str]] = None,
	semantic_client: Optional[Callable[[str], List[str]]] = None,
	) -> Dict[str, List[str]]:
	"""
	Extract keywords using hybrid approach: deterministic + semantic + LLM concepts.

	Args:
	question: The user's natural language question
	llm_concepts: Optional list of high-level concepts provided by the LLM agent
	semantic_client: Optional callable that takes a question and returns semantic hints
	If None, semantic extraction is skipped

	Returns:
	Dictionary with:
	- 'base': Deterministic keywords from stopword filtering
	- 'semantic': LLM-generated semantic hints (if semantic_client provided)
	- 'concepts': LLM-provided concepts (if llm_concepts provided)
	- 'combined': Merged, deduplicated list of all keywords
	"""
	# Normalize the question
	normalized = question.lower().strip()

	# Common stopwords to filter out
	stopwords = {
	'what', 'when', 'where', 'who', 'which', 'how', 'show', 'give', 'tell',
	'get', 'find', 'list', 'display', 'the', 'a', 'an', 'is', 'are', 'was',
	'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'can',
	'could', 'would', 'should', 'may', 'might', 'must', 'will', 'shall',
	'from', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into',
	'through', 'during', 'before', 'after', 'above', 'below', 'between',
	'under', 'of', 'me', 'my', 'our', 'total', 'all', 'any', 'each', 'every'
	}

	# STEP 1: Deterministic keyword extraction
	# Extract words, including splitting underscored identifiers
	words = re.findall(r'\b[a-z_][a-z0-9_]*\b', normalized)
	base_keywords = []
	for word in words:
	if '_' in word:
	# Split underscored identifiers (e.g., user_name -> user, name)
	parts = word.split('_')
	base_keywords.extend([p for p in parts if p not in stopwords and len(p) > 2])
	elif word not in stopwords and len(word) > 2:
	base_keywords.append(word)

	# Extract multi-word phrases (simple bigrams)
	tokens = normalized.split()
	bigrams = []
	for i in range(len(tokens) - 1):
	if tokens[i] not in stopwords and tokens[i+1] not in stopwords:
	phrase = f"{tokens[i]} {tokens[i+1]}"
	if re.match(r'^[a-z_][a-z0-9_ ]+$', phrase):
	bigrams.append(phrase)

	# Combine unique base keywords and relevant bigrams (limit bigrams to top 3)
	all_base_keywords = list(dict.fromkeys(base_keywords))
	for bigram in bigrams[:3]:
	if bigram not in all_base_keywords:
	all_base_keywords.append(bigram)

	# STEP 2: Semantic hints from LLM (if semantic_client provided)
	semantic_hints = []
	if semantic_client:
	try:
	semantic_hints = semantic_client(question)
	logger.info(f"Semantic hints extracted: {semantic_hints}")
	except Exception as e:
	logger.warning(f"Could not get semantic hints: {e}")
	semantic_hints = []

	# STEP 3: Process LLM-provided concepts (if any)
	processed_concepts = []
	if llm_concepts:
	processed_concepts = [
	c.lower().strip()
	for c in llm_concepts
	if c and c.strip()
	]
	processed_concepts = list(dict.fromkeys(processed_concepts)) # Dedupe

	# STEP 4: Combine all keywords (preserve order, remove duplicates)
	combined = []
	seen = set()

	# Add in order: concepts (LLM priority) -> base (deterministic) -> semantic (hints)
	for keyword_list in [processed_concepts, all_base_keywords, semantic_hints]:
	for kw in keyword_list:
	kw_lower = kw.lower().strip()
	if kw_lower and kw_lower not in seen:
	combined.append(kw)
	seen.add(kw_lower)

	result = {
	'base': all_base_keywords,
	'semantic': semantic_hints,
	'concepts': processed_concepts,
	'combined': combined
	}

	logger.debug(f"Hybrid keyword extraction for '{question}':")
	logger.debug(f" Base: {all_base_keywords}")
	logger.debug(f" Semantic: {semantic_hints}")
	logger.debug(f" Concepts: {processed_concepts}")
	logger.debug(f" Combined: {combined}")

	return result


	def create_gemini_semantic_client(model_id: str = "gemini-2.5-flash") -> Callable[[str], List[str]]:
	"""
	Creates a semantic client that uses Gemini to extract business concepts.

	Args:
	model_id: The Gemini model ID to use

	Returns:
	Callable that takes a question and returns list of semantic hints
	"""
	try:
	from agno.models.google import Gemini
	except ImportError:
	logger.error("Cannot import Gemini. Install agno package.")
	return lambda q: []

	def semantic_client(question: str) -> List[str]:
	"""Extract semantic hints using Gemini"""
	semantic_hint_prompt = f"""
	Given this user question, what are the likely business concepts or entity types they're asking about?
	Be concise and provide semantic categories (not exact table names).

	Question: "{question}"

	Think about:
	1. What business metric/concept are they asking about? (e.g., "earnings" = revenue/profit/income)
	2. What entities/dimensions are involved? (e.g., "2023" = time period, "customers" = people)
	3. What operations? (e.g., "compare" = aggregation/grouping, "trends" = time series)

	Respond with just 3-5 short semantic hints separated by commas (e.g., "revenue concept, time period, user dimension"):
	"""

	try:
	model = Gemini(id=model_id)
	response = model.generate(semantic_hint_prompt)
	semantic_hints_str = response.content if hasattr(response, 'content') else ""

	# Parse comma-separated hints
	hints = [h.strip() for h in semantic_hints_str.split(',') if h.strip()]
	return hints[:5] # Limit to 5 hints max

	except Exception as e:
	logger.warning(f"Gemini semantic extraction failed: {e}")
	return []

	return semantic_client