"""
Query rewriting / understanding utilities.

Goal:
- Produce two strings per user input:
  * retrieval_query: short, keyword-heavy, good for BM25 + embedding retrievers.
  * rerank_query: full intent/constraints (close to original) for cross-encoder rerank.

Design: pure rule-based (Phase 1, no LLM). Easily swappable later.
"""
from __future__ import annotations

import re
import json
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict, Set
from collections import Counter

# NLTK is an optional dependency. The two module-level names set below record
# whether the import succeeded so that tokenize() can choose a fallback.
try:
    from nltk.stem import PorterStemmer
    from nltk.tokenize import wordpunct_tokenize
    _NLTK_AVAILABLE = True
    _STEMMER = PorterStemmer()
except Exception:
    # Any failure while importing/instantiating is treated as "NLTK not usable".
    _NLTK_AVAILABLE = False
    _STEMMER = None

# Small, hand-curated vocabularies (extend as needed).
# Technical skill keywords; matched against lower-cased tokens.
TECH_SKILLS = {
    "java", "python", "c++", "c#", ".net", "dotnet",
    "sql", "mysql", "postgres",
    "react", "node", "javascript", "typescript",
    "spring", "django", "flask",
    "aws", "azure", "gcp", "kubernetes", "docker",
    "html", "css", "jquery", "selenium", "angular", "vue",
    "ruby", "rails", "php", "laravel",
    "terraform", "ansible", "jenkins",
    "ci", "cd", "cicd",
    "ml", "ai", "data", "science",
}

# Soft-skill keywords and phrases. Multi-word entries are matched against the
# space-joined token stream, single words against individual tokens.
SOFT_SKILLS = {
    "communication",
    "collaboration",
    "collaborate",
    "collaborative",
    "teamwork",
    "team",
    "stakeholder",
    "leadership",
    "analytical",
    "problem solving",
    "analytical thinking",
    "people management",
    "customer",
    "client",
    "negotiation",
    "presentation",
    "adaptability",
    "emotional",
    "conflict",
}

# Job level -> words whose presence in the text signals that level.
JOB_LEVEL_HINTS = dict(
    entry=["entry", "graduate", "junior"],
    mid=["mid", "mid-level", "midlevel"],
    senior=["senior", "sr", "lead"],
    manager=["manager", "management", "leadership"],
)

# Single tokens that usually name a role.
ROLE_HINTS = {
    "developer", "dev", "engineer", "manager", "analyst", "writer",
    "content", "designer", "consultant", "architect", "lead", "sales",
}

# Multi-word role names looked up verbatim in the lower-cased query.
ROLE_PHRASES = [
    "java developer", "software engineer", "software developer",
    "content writer", "technical writer",
    "product manager", "project manager",
    "data analyst", "data scientist", "business analyst",
]

# Domain intent canonicalization: map abstract user phrases to catalog-relevant terms.
# Keys are looked up in the lower-cased query text; each key must appear only once
# (a repeated key in a dict literal silently overrides the earlier entry).
INTENT_CANONICAL_MAP = {
    "culture fit": ["personality", "behavioral", "values", "situational judgement"],
    "cultural fit": ["personality", "behavioral", "values", "situational judgement"],
    "leadership": ["leadership", "management", "executive"],
    "coo": ["executive", "leadership", "management"],
    "chief operating officer": ["executive", "leadership", "management"],
    "content writer": ["english comprehension", "verbal ability", "writing"],
    "seo": ["verbal reasoning", "english comprehension", "writing"],
    "collaborate": ["communication", "teamwork", "interpersonal communications"],
    "collaboration": ["communication", "teamwork", "interpersonal communications"],
    "communication": ["communication", "interpersonal communications"],
    "business team": ["communication", "teamwork", "interpersonal communications"],
}

# Words dropped from token streams and from the final retrieval query.
STOPWORDS = set(
    (
        # Common English function words.
        "the and or for to with of a an in on at by is are be "
        "that this these those as from we our their your i you they he she it "
        "was were will can could should would who "
        # Extra noise terms common in user JDs / requests that dilute retrieval queries
        "want hiring hire new role company compani my budget option options "
        "give some about each test tests"
    ).split()
)

# Abbreviation -> expanded form, applied per token during tokenization.
SYNONYMS = dict(
    [
        ("js", "javascript"),
        ("ts", "typescript"),
        ("k8s", "kubernetes"),
        ("db", "database"),
        ("ml", "machine learning"),
        ("ai", "artificial intelligence"),
        ("comm", "communication"),
    ]
)

# Known typos -> corrected spelling; consulted before SYNONYMS.
MISSPELLINGS = dict(
    [
        ("pythn", "python"),
        ("javscript", "javascript"),
        ("dockr", "docker"),
        ("kubernets", "kubernetes"),
    ]
)


@dataclass
class DurationConstraint:
    """A requested assessment length."""

    # Either "MAX" (upper bound) or "TARGET" (approximate length).
    mode: str
    # Length in whole minutes.
    minutes: int


@dataclass
class ParsedConstraints:
    """Structured constraints pulled out of a query."""

    # Requested length, or None when the query states none.
    duration: Optional[DurationConstraint]
    # Job-level labels found in the query.
    job_levels: List[str]
    # Language names found in the query.
    languages: List[str]
    # Experience range as text, or None.
    experience: Optional[str]
    # remote/adaptive
    flags: Dict[str, Optional[bool]]


@dataclass
class QueryRewrite:
    """Result of rewriting one user query.

    Holds the short retrieval query, the full rerank query, the classified
    intent, the extracted skill/role lists, the parsed constraints and, when
    an LLM was involved, optional debug information.
    """

    retrieval_query: str
    rerank_query: str
    intent: str  # TECH / BEHAVIORAL / MIXED / UNKNOWN
    must_have_skills: List[str]
    soft_skills: List[str]
    role_terms: List[str]
    negated_skills: List[str]
    constraints: ParsedConstraints
    llm_debug: Optional[dict] = None

    def to_dict(self) -> dict:
        """Return a plain-dict copy of this rewrite.

        ``dataclasses.asdict`` already recurses into nested dataclasses
        (``constraints`` and its ``duration``) and copies contained dicts and
        lists, so no manual fix-up is needed and the returned structure shares
        no mutable state with this instance.
        """
        return asdict(self)


def _normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()


def _lower(text: str) -> str:
    return _normalize(text.lower())


def parse_duration(text: str) -> Optional[DurationConstraint]:
    """Extract a duration constraint from free text.

    Recognises, in this order: "half (an) hour", "an hour", a numeric hour
    amount ("1.5 hours", "2 hrs") and a numeric minute amount ("40 min").

    The mode is "MAX" when the text contains a cap phrase ("at most",
    "within", "under", "less than", "<=") and "TARGET" otherwise. The cap
    phrase is searched for anywhere in the text, not only next to the number.

    Returns:
        A DurationConstraint, or None when no duration is found.
    """
    t = text.lower()

    # Word boundaries stop "under" from firing inside e.g. "understanding".
    capped = re.search(r"\b(?:at most|within|under|less than)\b|<=", t) is not None
    mode = "MAX" if capped else "TARGET"

    # "half an hour" contains "an hour", so it has to be tested first.
    if re.search(r"\bhalf[\s-]+(?:an?[\s-]+)?hour\b", t):
        return DurationConstraint(mode=mode, minutes=30)
    # Also covers "about an hour" / "around an hour".
    if re.search(r"\ban hour\b", t):
        return DurationConstraint(mode=mode, minutes=60)

    # hours pattern e.g., 1.5 hours
    m = re.search(r"(\d+(?:\.\d+)?)\s*(?:hours?|hrs?)\b", t)
    if m:
        minutes = int(round(float(m.group(1)) * 60))
        return DurationConstraint(mode=mode, minutes=minutes)

    # minutes pattern; the trailing \b rejects e.g. "40 minimum"
    m = re.search(r"(\d{1,3})\s*(?:minutes?|mins?)\b", t)
    if m:
        return DurationConstraint(mode=mode, minutes=int(m.group(1)))
    return None


def parse_flags(text: str) -> Dict[str, Optional[bool]]:
    """Detect the "remote" and "adaptive" flags in *text*.

    A flag is True when its keyword occurs as a whole word (case-insensitive;
    "irt" also counts for "adaptive") and None otherwise. It is never False.
    """
    lowered = text.lower()
    flags: Dict[str, Optional[bool]] = {"remote": None, "adaptive": None}
    if re.search(r"\bremote\b", lowered):
        flags["remote"] = True
    if re.search(r"\badaptive\b|\birt\b", lowered):
        flags["adaptive"] = True
    return flags


def parse_languages(text: str) -> List[str]:
    """Return the known language names mentioned in *text*.

    Matching is case-insensitive and on whole words, so a country name such as
    "Germany" does not count as the language "German". The result follows the
    order of the internal table, not the order of appearance in the text.
    """
    LANGS = {
        "english": "English",
        "spanish": "Spanish",
        "french": "French",
        "german": "German",
        "mandarin": "Mandarin",
    }
    t = text.lower()
    return [
        label
        for key, label in LANGS.items()
        if re.search(rf"\b{re.escape(key)}\b", t)
    ]


def parse_experience(text: str) -> Optional[str]:
    """Extract an experience range from free text as "<low>-<high> years".

    Accepted forms: "3 years", "3-5 yrs", "3 to 5 years" and "5+ years".
    When only one number is given, the upper bound is that number plus two.
    "fresher(s)" and "entry-level" / "entry level" map to "0-2 years".

    Returns:
        The range string, or None when nothing matches.
    """
    t = text.lower()
    # The lookbehind keeps the match from starting inside a longer number
    # ("2023 years") or after a decimal point ("1.5 years").
    m = re.search(
        r"(?<![\d.])(\d{1,2})(?:\s*(?:-|–|to)\s*(\d{1,2}))?\s*\+?\s*(?:years?|yrs?)\b",
        t,
    )
    if m:
        low = int(m.group(1))
        high = int(m.group(2)) if m.group(2) else low + 2
        return f"{low}-{high} years"
    # Whole-word match so that "refresher" is not read as "fresher".
    if re.search(r"\bfreshers?\b|\bentry[\s-]level\b", t):
        return "0-2 years"
    return None


def parse_job_levels(text: str) -> List[str]:
    """Return the sorted, title-cased job levels whose hint words occur in *text*.

    A level counts as present when any of its JOB_LEVEL_HINTS entries matches
    as a whole word in the lower-cased text.
    """
    lowered = text.lower()
    matched = {
        level.title()
        for level, hints in JOB_LEVEL_HINTS.items()
        if any(re.search(rf"\b{re.escape(hint)}\b", lowered) for hint in hints)
    }
    return sorted(matched)


def tokenize(text: str) -> List[str]:
    """Split *text* into lower-cased, normalized, stopword-free tokens.

    Steps:
      1. Regex tokenization that keeps tech spellings such as "c++", "c#" and
         ".net" intact and never emits pure punctuation tokens. The same
         tokenizer is used whether or not NLTK is installed, so results do
         not depend on an optional dependency.
      2. Misspelling correction, then synonym expansion. A multi-word
         expansion (e.g. "ml" -> "machine learning") yields one token per word.
      3. Stopword removal before stemming, so stems such as "thi" (from
         "this") cannot slip past the filter.
      4. Stemming with the NLTK Porter stemmer when available, otherwise with
         a small suffix stripper. Terms from TECH_SKILLS, SOFT_SKILLS and
         ROLE_HINTS are left unstemmed because later stages look tokens up in
         those sets verbatim.
      5. A second stopword pass for stems that collapse onto a stopword.
    """

    def simple_stem(w: str) -> str:
        if w.endswith("ing") and len(w) > 4:
            return w[:-3]
        # Do not strip the final "s" of words ending in "ss" ("class").
        if w.endswith("s") and not w.endswith("ss") and len(w) > 3:
            return w[:-1]
        return w

    protected = TECH_SKILLS | SOFT_SKILLS | ROLE_HINTS

    # A token starts with an optional "." (".net") followed by a word
    # character; sentence-final dots are trimmed afterwards.
    raw_tokens = [
        tok.rstrip(".") for tok in re.findall(r"\.?\w[\w+#.]*", text.lower())
    ]

    expanded: List[str] = []
    for tok in raw_tokens:
        replacement = MISSPELLINGS.get(tok, SYNONYMS.get(tok, tok))
        expanded.extend(replacement.split())

    if _NLTK_AVAILABLE and _STEMMER is not None:
        stem = _STEMMER.stem
    else:
        stem = simple_stem

    result: List[str] = []
    for tok in expanded:
        if tok in STOPWORDS:
            continue
        if tok not in protected:
            tok = stem(tok)
        if tok and tok not in STOPWORDS:
            result.append(tok)
    return result


def extract_phrases(tokens: List[str], max_phrases: int = 5) -> List[str]:
    """Pick up to *max_phrases* informative n-grams from *tokens*.

    All trigrams are scanned before any bigram. A window is kept when it
    contains at least one signal term, is not made only of stopwords, and has
    fewer than two tokens of length <= 2. Duplicates are dropped.
    """
    signal_terms = set(TECH_SKILLS) | set(ROLE_HINTS) | set(SOFT_SKILLS) | {
        "graduate",
        "junior",
        "entry",
        "senior",
        "manager",
        "leadership",
        "culture",
        "values",
        "personality",
        "behavior",
        "behaviour",
        "sales",
        "marketing",
    }
    found: List[str] = []
    total = len(tokens)
    for width in (3, 2):  # trigrams first, then bigrams
        for start in range(total - width + 1):
            window = tokens[start : start + width]
            has_signal = any(word in signal_terms for word in window)
            only_stopwords = all(word in STOPWORDS for word in window)
            short_count = sum(len(word) <= 2 for word in window)
            if not has_signal or only_stopwords or short_count >= 2:
                continue
            candidate = " ".join(window)
            if candidate not in found:
                found.append(candidate)
            # Checked after every accepted window, whether or not it was new.
            if len(found) >= max_phrases:
                return found
    return found[:max_phrases]


def extract_skills(tokens: List[str]) -> tuple[Set[str], Set[str], Set[str]]:
    """Find technical, soft and negated skills in a token list.

    Skills are searched in the space-joined token stream so that multi-word
    soft skills ("problem solving") can match. Term edges are checked with
    lookarounds rather than ``\\b``: ``\\b`` requires a word character on one
    side, so it can never match right after "c++" / "c#" or right before
    ".net".

    A technical skill that directly follows a negation word ("no", "without",
    "exclude", "not") is reported as negated and removed from the must-have
    set.

    Returns:
        (must_have, soft, negated) as three sets of vocabulary terms.
    """
    joined = " ".join(tokens)

    def mentioned(term: str) -> bool:
        return re.search(rf"(?<![\w+#]){re.escape(term)}(?![\w+#])", joined) is not None

    must = {skill for skill in TECH_SKILLS if mentioned(skill)}
    soft = {skill for skill in SOFT_SKILLS if mentioned(skill)}

    # "exclud" is what stemming makes of "exclude" / "excluding".
    negators = {"no", "without", "exclude", "exclud", "not"}
    negated: Set[str] = set()
    for prev, tok in zip(tokens, tokens[1:]):
        if tok in TECH_SKILLS and prev in negators:
            negated.add(tok)
            must.discard(tok)
    return must, soft, negated


def top_keywords(tokens: List[str], k: int = 15) -> List[str]:
    """Return the *k* most frequent tokens, most frequent first."""
    ranked = Counter(tokens).most_common(k)
    return [word for word, _count in ranked]


def classify_intent(tokens: List[str]) -> str:
    """Classify a token list as "TECH", "BEHAVIORAL", "MIXED" or "UNKNOWN".

    A technical hit is any token in TECH_SKILLS. A behavioral hit is any token
    from a fixed marker set or any token starting with "sales".
    """
    behavioral_markers = {
        "communication",
        "collaboration",
        "teamwork",
        "stakeholder",
        "leadership",
        "personality",
        "values",
        "culture",
        "cultural",
        "fit",
        "behavioral",
        "behavioural",
    }
    unique = set(tokens)
    has_tech = not unique.isdisjoint(TECH_SKILLS)
    has_behavioral = (not unique.isdisjoint(behavioral_markers)) or any(
        tok.startswith("sales") for tok in unique
    )
    if has_tech:
        return "MIXED" if has_behavioral else "TECH"
    return "BEHAVIORAL" if has_behavioral else "UNKNOWN"


# Tokens treated as locations and removed from the token stream.
LOCATION_TOKENS = {
    "china", "india", "usa", "uk", "europe",
    "us", "canada", "germany", "france",
}


def strip_locations(tokens: List[str]) -> List[str]:
    """Return *tokens* without any entry listed in LOCATION_TOKENS."""
    kept: List[str] = []
    for tok in tokens:
        if tok in LOCATION_TOKENS:
            continue
        kept.append(tok)
    return kept


def intent_canonical_terms(text_lower: str) -> List[str]:
    """Return the catalog terms mapped from intent phrases found in *text_lower*.

    Phrases from INTENT_CANONICAL_MAP are matched on word boundaries, so short
    keys such as "coo" or "seo" do not fire inside longer words
    ("coordinate", "seoul"). Mapped terms are returned in first-seen order
    without duplicates.

    Args:
        text_lower: The query text, already lower-cased by the caller.
    """
    terms: List[str] = []
    for phrase, mapped in INTENT_CANONICAL_MAP.items():
        if re.search(rf"\b{re.escape(phrase)}\b", text_lower):
            terms.extend(mapped)
    # dict keys keep insertion order: dedupe while preserving order
    return list(dict.fromkeys(terms))


def build_retrieval_query(
    role_terms: List[str],
    must_skills: List[str],
    soft_skills: List[str],
    constraints: ParsedConstraints,
    extra_terms: List[str],
    phrases: Optional[List[str]] = None,
    canonical_terms: Optional[List[str]] = None,
) -> str:
    """Assemble the keyword-style retrieval query.

    Parts are concatenated in this order: phrases (each once with underscores
    and once as plain text), role terms, must-have skills, soft skills, extra
    terms, canonical terms, experience, duration, languages. Duplicate parts
    and parts made only of stopwords are dropped, and the result is cut to at
    most 40 whitespace-separated tokens.
    """
    pieces: List[str] = []

    # Each phrase goes in twice: underscore-joined so it stays a single token
    # (for BM25), and as plain text to keep the semantic signal for embeddings.
    for raw_phrase in phrases or []:
        cleaned = raw_phrase.strip()
        if cleaned:
            pieces += [cleaned.replace(" ", "_"), cleaned]

    pieces += role_terms
    pieces += must_skills
    # Negated skills are not passed in here, which leaves them out of the query.
    pieces += soft_skills
    pieces += extra_terms
    pieces += canonical_terms or []

    if constraints.experience:
        pieces.append(constraints.experience)
    duration = constraints.duration
    if duration:
        if duration.mode == "MAX":
            pieces.append("duration under")
        pieces.append(f"{duration.minutes} minutes")
    if constraints.languages:
        pieces += [language.lower() for language in constraints.languages]

    def _only_stopwords(term: str) -> bool:
        # Underscore-joined phrases are judged word by word.
        words = term.replace("_", " ").split()
        return all(word in STOPWORDS for word in words)

    # keep order, drop dupes and stopword-only parts
    unique: List[str] = []
    emitted = set()
    for term in pieces:
        if not term or term in emitted or _only_stopwords(term):
            continue
        unique.append(term)
        emitted.add(term)

    query = _normalize(" ".join(unique))
    # Trim to ~40 tokens to avoid bloating retriever input
    words = query.split()
    if len(words) > 40:
        query = " ".join(words[:40])
    return query


def _boost_from_vocab(tokens: Set[str], vocab: Dict[str, List[str]], intent: str, max_terms: int = 5) -> List[str]:
    out: List[str] = []
    if not vocab:
        return out
    if intent in ("TECH", "MIXED") and "technical" in vocab:
        for w in vocab["technical"]:
            if w in tokens and w not in out:
                out.append(w)
            if len(out) >= max_terms:
                return out
    if intent in ("BEHAVIORAL", "MIXED") and "behavioral" in vocab:
        for w in vocab["behavioral"]:
            if w in tokens and w not in out:
                out.append(w)
            if len(out) >= max_terms:
                return out
    if "roles" in vocab:
        for w in vocab["roles"]:
            if w in tokens and w not in out:
                out.append(w)
            if len(out) >= max_terms:
                return out
    return out[:max_terms]


def _extract_json_text(raw):
    # NuExtractWrapper often returns dict with clean_output
    if isinstance(raw, dict):
        for k in ("clean_output", "output", "text"):
            v = raw.get(k)
            if isinstance(v, str) and v.strip():
                return v
        # sometimes raw_output exists
        v = raw.get("raw_output")
        if isinstance(v, str) and v.strip():
            return v
        # if dict itself is already the parsed object
        if "retrieval_query" in raw or "rerank_query" in raw:
            return json.dumps(raw)
        return ""  # will fail loudly below

    # QwenRewriter might return str or dict
    if isinstance(raw, str):
        return raw
    if raw is None:
        return ""
    # anything else
    return str(raw)

def _coerce_json(s: str) -> dict:
    s = (s or "").strip()
    if not s:
        raise ValueError("LLM returned empty output")

    # strip fences
    if "```" in s:
        s = s.replace("```json", "").replace("```", "").strip()

    # extract first JSON object
    a, b = s.find("{"), s.rfind("}")
    if a != -1 and b != -1 and b > a:
        s = s[a:b+1]

    return json.loads(s)


# --- LLM rewrite (Qwen or any extractor exposing the same predict() call) ---
# JSON skeleton passed to the extractor as `schema`; each value describes the
# expected content of its field.
LLM_SCHEMA = json.dumps(
    {
        "retrieval_query": "concise keyword-heavy string",
        "rerank_query": "full query text (keep intent/constraints)",
        "intent": "one of TECH, BEHAVIORAL, MIXED, UNKNOWN",
        "must_have_skills": ["string"],
        "soft_skills": ["string"],
        "role_terms": ["string"],
        "negated_skills": ["string"],
        "constraints": {
            "duration": {"mode": "one of MAX, TARGET, or null", "minutes": "integer or null"},
            "job_levels": ["string"],
            "languages": ["string"],
            "experience": "string or null",
            "flags": {"remote": "boolean or null", "adaptive": "boolean or null"},
        },
    },
    indent=2,
)


def _rewrite_with_llm(raw_text: str, catalog_vocab: Optional[Dict[str, List[str]]], llm_extractor) -> Optional[QueryRewrite]:
    """Rewrite *raw_text* with an LLM extractor.

    Calls ``llm_extractor.predict(text=..., schema=LLM_SCHEMA,
    return_full=True)``, parses the JSON answer and converts it into a
    QueryRewrite.

    Args:
        raw_text: The user's original query.
        catalog_vocab: Accepted for signature compatibility; not used here.
        llm_extractor: Object providing the ``predict`` call described above.

    Returns:
        The QueryRewrite on success. None on any failure (extractor error,
        unparsable output, or output that merely echoes the schema
        placeholders) so the caller can fall back to the deterministic
        rewrite. When the extractor returned a dict, debug details are
        attached as ``llm_debug``.
    """
    allowed_intents = {"TECH", "BEHAVIORAL", "MIXED", "UNKNOWN"}
    try:
        raw = llm_extractor.predict(text=raw_text, schema=LLM_SCHEMA, return_full=True)
        raw_json = _extract_json_text(raw)
        data = _coerce_json(raw_json)

        # `or {}` also covers an explicit JSON null for these keys.
        constraints_raw = data.get("constraints") or {}
        dur = constraints_raw.get("duration") or {}
        duration_obj = None
        duration_error = None
        if dur.get("minutes") is not None:
            # Anything other than MAX/TARGET (e.g. an echoed placeholder) is
            # treated as the TARGET default.
            mode = str(dur.get("mode") or "TARGET").upper()
            if mode not in ("MAX", "TARGET"):
                mode = "TARGET"
            try:
                duration_obj = DurationConstraint(mode=mode, minutes=int(float(dur["minutes"])))
            except (TypeError, ValueError, OverflowError) as e:
                # A bad duration is recorded for debugging but is not fatal.
                duration_error = f"duration_parse_error: {e}"

        constraints = ParsedConstraints(
            duration=duration_obj,
            job_levels=constraints_raw.get("job_levels") or [],
            languages=constraints_raw.get("languages") or [],
            experience=constraints_raw.get("experience"),
            flags=constraints_raw.get("flags") or {"remote": None, "adaptive": None},
        )

        intent_raw = data.get("intent")
        retrieval_q = data.get("retrieval_query") or raw_text
        rerank_q = data.get("rerank_query") or raw_text

        # Placeholder / schema-echo detection: treat as failure.
        if retrieval_q.strip().lower() in {"string", ""} or rerank_q.strip().lower() in {"string", ""}:
            return None
        if intent_raw and "|" in str(intent_raw):
            return None

        if isinstance(intent_raw, str) and intent_raw in allowed_intents:
            intent_final = intent_raw
        else:
            # Missing or unrecognized intent: classify heuristically.
            intent_final = classify_intent(tokenize(_lower(retrieval_q)))

        rw = QueryRewrite(
            retrieval_query=retrieval_q,
            rerank_query=rerank_q,
            intent=intent_final or "UNKNOWN",
            must_have_skills=data.get("must_have_skills") or [],
            soft_skills=data.get("soft_skills") or [],
            role_terms=data.get("role_terms") or [],
            negated_skills=data.get("negated_skills") or [],
            constraints=constraints,
        )

        if isinstance(raw, dict):
            rw.llm_debug = {
                "prompt": raw.get("prompt"),
                "raw_output": raw.get("raw_output"),
                "clean_output": raw_json,
                "intent_raw": intent_raw,
                "model": getattr(llm_extractor, "model_name", "llm"),
            }
            if duration_error:
                rw.llm_debug["duration_error"] = duration_error
        return rw
    except Exception:
        # Best-effort boundary: any LLM-side problem means "use the fallback".
        return None


def rewrite_query(raw_text: str, catalog_vocab: Optional[Dict[str, List[str]]] = None, llm_extractor=None) -> QueryRewrite:
    """Rewrite a raw user query into retrieval and rerank forms.

    When *llm_extractor* is given, the LLM rewrite is tried first and returned
    if it succeeded without an error marker in its debug info. Otherwise the
    rule-based pipeline runs: tokenize, strip locations, parse constraints,
    classify intent, extract skills, pick role terms and phrases, and build
    the retrieval query. The rerank query is the stripped original text.
    """
    catalog_vocab = catalog_vocab or {}
    raw_clean = raw_text.strip()
    low = _lower(raw_text)

    # LLM-based rewrite first if provided
    llm_fail_debug = None
    if llm_extractor:
        candidate = _rewrite_with_llm(raw_text, catalog_vocab, llm_extractor)
        if candidate:
            debug = candidate.llm_debug
            if not (debug and debug.get("error")):
                return candidate
            # Keep the failure details for the fallback result.
            llm_fail_debug = debug

    tokens = strip_locations(tokenize(low))

    constraints = ParsedConstraints(
        duration=parse_duration(raw_text),
        job_levels=parse_job_levels(raw_text),
        languages=parse_languages(raw_text),
        experience=parse_experience(raw_text),
        flags=parse_flags(raw_text),
    )

    intent = classify_intent(tokens)
    must_skills, soft_sk, neg_skills = extract_skills(tokens)
    keywords = top_keywords(tokens, k=25)
    # Boost with catalog vocab matches, typed by intent.
    boost = _boost_from_vocab(set(tokens), catalog_vocab, intent, max_terms=5)

    # Role terms: known role phrases first, then role-hint keywords.
    role_terms: List[str] = []
    for phrase in ROLE_PHRASES:
        if phrase in low and phrase not in role_terms:
            role_terms.append(phrase)
    for keyword in keywords:
        if keyword in ROLE_HINTS and keyword not in role_terms:
            role_terms.append(keyword)
        if len(role_terms) >= 5:
            break
    # Fewer than three role terms: backfill with the top keywords.
    if len(role_terms) < 3:
        for keyword in keywords:
            if keyword not in role_terms and keyword not in STOPWORDS:
                role_terms.append(keyword)
            if len(role_terms) >= 5:
                break

    must_sorted = sorted(must_skills)
    soft_sorted = sorted(soft_sk)
    retrieval_query = build_retrieval_query(
        role_terms,
        must_sorted,
        soft_sorted,
        constraints,
        boost,
        phrases=extract_phrases(tokens, max_phrases=5),
        canonical_terms=intent_canonical_terms(low),
    )

    return QueryRewrite(
        retrieval_query=retrieval_query,
        rerank_query=raw_clean,  # keep full context for reranker
        intent=intent,
        must_have_skills=sorted(must_skills),
        soft_skills=sorted(soft_sk),
        role_terms=role_terms,
        negated_skills=sorted(neg_skills),
        constraints=constraints,
        llm_debug=llm_fail_debug,
    )


def build_catalog_vocab(catalog_texts: List[str], min_len: int = 5, max_terms: int = 200) -> Dict[str, List[str]]:
    """Build a small vocabulary from catalog texts, bucketed by intent.

    Every token of at least *min_len* characters is counted in the first
    bucket it fits: "technical" (in TECH_SKILLS), "behavioral" (in SOFT_SKILLS
    or containing "behavior"/"culture"), or "roles" (in ROLE_HINTS). Each
    bucket keeps its most frequent terms, at most ``max_terms // 3`` (minimum
    one) of them.
    """
    tech_counts: Counter = Counter()
    behavioral_counts: Counter = Counter()
    role_counts: Counter = Counter()

    for text in catalog_texts:
        for token in tokenize(text.lower()):
            if len(token) < min_len:
                continue
            if token in TECH_SKILLS:
                bucket = tech_counts
            elif token in SOFT_SKILLS or "behavior" in token or "culture" in token:
                bucket = behavioral_counts
            elif token in ROLE_HINTS:
                bucket = role_counts
            else:
                continue
            bucket[token] += 1

    per_bucket = max_terms // 3 or 1
    named_counts = (
        ("technical", tech_counts),
        ("behavioral", behavioral_counts),
        ("roles", role_counts),
    )
    return {
        name: [word for word, _count in counts.most_common(per_bucket)]
        for name, counts in named_counts
    }


# Command-line entry point: rewrite one query and print the result as JSON.
if __name__ == "__main__":
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser(description="Rewrite a query into retrieval + rerank forms.")
    parser.add_argument("--query", required=True, help="Raw user query/JD text")
    parser.add_argument("--catalog", help="Optional catalog JSONL to build vocab (uses doc_text or name/description)")
    args = parser.parse_args()

    vocab = {}
    if args.catalog:
        # pandas is only needed when a catalog file is supplied.
        import pandas as pd

        df = pd.read_json(args.catalog, lines=True)
        if "doc_text" in df.columns:
            texts = df["doc_text"].astype(str).tolist()
        else:

            def _column_text(column):
                # A missing column contributes empty strings. df.get(column, "")
                # would return a plain str, which has no .astype().
                if column in df.columns:
                    return df[column].astype(str).tolist()
                return [""] * len(df)

            texts = [
                f"{name} {description}"
                for name, description in zip(_column_text("name"), _column_text("description"))
            ]
        vocab = build_catalog_vocab(texts)

    rewrite = rewrite_query(args.query, vocab)
    json.dump(rewrite.to_dict(), sys.stdout, indent=2)
    sys.stdout.write("\n")