"""
backend/app/core/topic.py

Extracts a 1–3 word topic label from a natural-language query.

Used by Guard, Retrieve, and any node that surfaces context-specific status
labels ("Checking your question about machine learning", "Searching portfolio
for RAG pipeline") without any LLM call.  The extraction is a pure set-lookup
— it adds no measurable latency.

>>> extract_topic("What are Darshan's machine learning projects?")
'machine learning projects'
>>> extract_topic("Tell me about his background")
'background'
>>> extract_topic("How does he implement RAG?")
'implement RAG'
>>> extract_topic("What is")
'What is'
"""
from __future__ import annotations

import re

# Comprehensive stopword set: prepositions, articles, auxiliary verbs, common
# question words, personal pronouns, demonstratives, and portfolio-query filler.
# Content-bearing words (nouns, adjectives, verbs like "implement", "built")
# are intentionally absent — they ARE the topic.
_STOPWORDS: frozenset[str] = frozenset({
    # Articles
    "a", "an", "the",
    # Prepositions
    "about", "above", "across", "after", "against", "along", "among",
    "around", "at", "before", "behind", "below", "beneath", "beside",
    "between", "beyond", "by", "during", "except", "for", "from", "in",
    "inside", "into", "like", "near", "of", "off", "on", "onto", "out",
    "outside", "over", "past", "regarding", "since", "through",
    "throughout", "to", "toward", "under", "underneath", "until", "up",
    "upon", "with", "within", "without",
    # Conjunctions
    "and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
    # Common auxiliary verbs
    "is", "are", "was", "were", "be", "been", "being",
    "has", "have", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "can", "shall",
    # Irregular negations: removing "n't" from these does not leave a real
    # auxiliary ("ca", "wo", "sha"), so they are listed whole.
    "cannot", "can't", "won't", "shan't",
    # Question words
    "what", "who", "where", "when", "how", "why", "which",
    # Personal pronouns
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their",
    "mine", "yours", "hers", "ours", "theirs",
    # Demonstratives
    "this", "that", "these", "those",
    # Common portfolio-query filler ("me" and "about" are already listed above)
    "tell", "show", "give", "list", "get", "find",
    "look", "also", "just", "really", "very", "more", "most",
    "some", "any", "other", "another", "same", "such", "own",
    "darshan", "chheda",  # owner name is not a useful topic word
})

# A token is a run of letters/digits in any script.  An apostrophe or a period
# is kept only when it sits *between* two such runs, so "Darshan's", "Node.js"
# and "3.11" stay whole while wrapping quotes and sentence punctuation are
# dropped.  Underscores are never part of a token.
_TOKEN_RE = re.compile(r"[^\W_]+(?:['.][^\W_]+)*")

# Clitic endings removed before the stopword lookup, so that "Darshan's",
# "what's", "they're" or "doesn't" are recognised through their base word.
_CLITIC_SUFFIXES: tuple[str, ...] = ("n't", "'s", "'re", "'ve", "'ll", "'d", "'m")


def _is_content_word(token: str) -> bool:
    """Return True when ``token`` should be kept as part of the topic."""
    lowered = token.lower()
    if lowered in _STOPWORDS:
        return False
    # Only one clitic is stripped; the first matching suffix wins.
    for suffix in _CLITIC_SUFFIXES:
        if lowered.endswith(suffix):
            lowered = lowered[: -len(suffix)]
            break
    # Single-character leftovers ("s", "t", stray initials) carry no topic.
    return len(lowered) > 1 and lowered not in _STOPWORDS


def extract_topic(query: str) -> str:
    """Return a 1–3 word topic phrase extracted from ``query``.

    The query is split into word tokens (letters and digits of any script;
    apostrophes and periods are kept only inside a token).  Tokens whose
    lowercase form is a stopword are dropped, as are tokens that reduce to a
    stopword or to a single character once a possessive/contraction ending
    ("'s", "n't", "'re", ...) is removed.  The first three surviving tokens
    are returned in their original spelling, joined by single spaces.

    If no content word survives (all stopwords), the first two
    whitespace-separated words of the original query are returned verbatim.
    An empty or whitespace-only query yields ``""``; callers that need a
    non-empty label should supply their own default in that case.
    """
    # Typographic apostrophes are treated like ASCII ones so that contractions
    # typed on phones ("can’t", "what’s") hit the same stopword rules.
    normalized = query.replace("\u2019", "'")
    content = [t for t in _TOKEN_RE.findall(normalized) if _is_content_word(t)]

    if content:
        return " ".join(content[:3])

    # Fallback: keep the first two words of the original query verbatim.
    return " ".join(query.split()[:2])