"""
Hybrid Keyword Extraction Utilities

Provides reusable keyword extraction combining:
1. Deterministic extraction (regex + stopword filtering)
2. Semantic hints via LLM (business concepts)
3. Optional LLM-provided concepts

This module can be imported by any agent needing intelligent schema discovery.
"""

import re
import logging
from typing import Dict, List, Optional, Callable, Any

logger = logging.getLogger(__name__)


def extract_hybrid_keywords(
    question: str,
    llm_concepts: Optional[List[str]] = None,
    semantic_client: Optional[Callable[[str], List[str]]] = None,
) -> Dict[str, List[str]]:
    """
    Extract keywords using hybrid approach: deterministic + semantic + LLM concepts.
    
    Args:
        question: The user's natural language question
        llm_concepts: Optional list of high-level concepts provided by the LLM agent
        semantic_client: Optional callable that takes a question and returns semantic hints
                        If None, semantic extraction is skipped
    
    Returns:
        Dictionary with:
        - 'base': Deterministic keywords from stopword filtering
        - 'semantic': LLM-generated semantic hints (if semantic_client provided)
        - 'concepts': LLM-provided concepts (if llm_concepts provided)
        - 'combined': Merged, deduplicated list of all keywords
    """
    # Normalize the question
    normalized = question.lower().strip()
    
    # Common stopwords to filter out
    stopwords = {
        'what', 'when', 'where', 'who', 'which', 'how', 'show', 'give', 'tell', 
        'get', 'find', 'list', 'display', 'the', 'a', 'an', 'is', 'are', 'was',
        'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'can',
        'could', 'would', 'should', 'may', 'might', 'must', 'will', 'shall',
        'from', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into',
        'through', 'during', 'before', 'after', 'above', 'below', 'between',
        'under', 'of', 'me', 'my', 'our', 'total', 'all', 'any', 'each', 'every'
    }
    
    # STEP 1: Deterministic keyword extraction
    # Extract words, including splitting underscored identifiers
    words = re.findall(r'\b[a-z_][a-z0-9_]*\b', normalized)
    base_keywords = []
    for word in words:
        if '_' in word:
            # Split underscored identifiers (e.g., user_name -> user, name)
            parts = word.split('_')
            base_keywords.extend([p for p in parts if p not in stopwords and len(p) > 2])
        elif word not in stopwords and len(word) > 2:
            base_keywords.append(word)
    
    # Extract multi-word phrases (simple bigrams)
    tokens = normalized.split()
    bigrams = []
    for i in range(len(tokens) - 1):
        if tokens[i] not in stopwords and tokens[i+1] not in stopwords:
            phrase = f"{tokens[i]} {tokens[i+1]}"
            if re.match(r'^[a-z_][a-z0-9_ ]+$', phrase):
                bigrams.append(phrase)
    
    # Combine unique base keywords and relevant bigrams (limit bigrams to top 3)
    all_base_keywords = list(dict.fromkeys(base_keywords))
    for bigram in bigrams[:3]:
        if bigram not in all_base_keywords:
            all_base_keywords.append(bigram)
    
    # STEP 2: Semantic hints from LLM (if semantic_client provided)
    semantic_hints = []
    if semantic_client:
        try:
            semantic_hints = semantic_client(question)
            logger.info(f"Semantic hints extracted: {semantic_hints}")
        except Exception as e:
            logger.warning(f"Could not get semantic hints: {e}")
            semantic_hints = []
    
    # STEP 3: Process LLM-provided concepts (if any)
    processed_concepts = []
    if llm_concepts:
        processed_concepts = [
            c.lower().strip() 
            for c in llm_concepts 
            if c and c.strip()
        ]
        processed_concepts = list(dict.fromkeys(processed_concepts))  # Dedupe
    
    # STEP 4: Combine all keywords (preserve order, remove duplicates)
    combined = []
    seen = set()
    
    # Add in order: concepts (LLM priority) -> base (deterministic) -> semantic (hints)
    for keyword_list in [processed_concepts, all_base_keywords, semantic_hints]:
        for kw in keyword_list:
            kw_lower = kw.lower().strip()
            if kw_lower and kw_lower not in seen:
                combined.append(kw)
                seen.add(kw_lower)
    
    result = {
        'base': all_base_keywords,
        'semantic': semantic_hints,
        'concepts': processed_concepts,
        'combined': combined
    }
    
    logger.debug(f"Hybrid keyword extraction for '{question}':")
    logger.debug(f"  Base: {all_base_keywords}")
    logger.debug(f"  Semantic: {semantic_hints}")
    logger.debug(f"  Concepts: {processed_concepts}")
    logger.debug(f"  Combined: {combined}")
    
    return result


def create_gemini_semantic_client(model_id: str = "gemini-2.5-flash") -> Callable[[str], List[str]]:
    """
    Creates a semantic client that uses Gemini to extract business concepts.
    
    Args:
        model_id: The Gemini model ID to use
        
    Returns:
        Callable that takes a question and returns list of semantic hints
    """
    try:
        from agno.models.google import Gemini
    except ImportError:
        logger.error("Cannot import Gemini. Install agno package.")
        return lambda q: []
    
    def semantic_client(question: str) -> List[str]:
        """Extract semantic hints using Gemini"""
        semantic_hint_prompt = f"""
Given this user question, what are the likely business concepts or entity types they're asking about?
Be concise and provide semantic categories (not exact table names).

Question: "{question}"

Think about:
1. What business metric/concept are they asking about? (e.g., "earnings" = revenue/profit/income)
2. What entities/dimensions are involved? (e.g., "2023" = time period, "customers" = people)
3. What operations? (e.g., "compare" = aggregation/grouping, "trends" = time series)

Respond with just 3-5 short semantic hints separated by commas (e.g., "revenue concept, time period, user dimension"):
"""
        
        try:
            model = Gemini(id=model_id)
            response = model.generate(semantic_hint_prompt)
            semantic_hints_str = response.content if hasattr(response, 'content') else ""
            
            # Parse comma-separated hints
            hints = [h.strip() for h in semantic_hints_str.split(',') if h.strip()]
            return hints[:5]  # Limit to 5 hints max
            
        except Exception as e:
            logger.warning(f"Gemini semantic extraction failed: {e}")
            return []
    
    return semantic_client