Spaces:

mlbench123
/

aesthetic_AI

Sleeping

App Files Files Community

mlbench123 committed on Jan 19

Commit

aaa22f8

verified ·

1 Parent(s): fec0153

Upload rag_treatment_app.py

Browse files

Files changed (1) hide show

rag_treatment_app.py +874 -0

rag_treatment_app.py ADDED Viewed

	@@ -0,0 +1,874 @@

+#!/usr/bin/env python3
+"""RAGTreatmentSearchApp.
+Structured RAG AI Search over database.xlsx with strict filtering.
+UPDATED:
+- Adds mismatch detection for worst-case scenario:
+  If user-selected Region/Sub-Zone is inconsistent with the issue text,
+  return a warning message + recommended Region/Sub-Zone suggestions (exact DB names),
+  instead of producing irrelevant treatments.
+This prevents cases like:
+  Hair -> Scalp + "dark circles under eyes"
+"""
+from __future__ import annotations
+import json
+import os
+import pickle
+import re
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+import pandas as pd
+import torch
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from llm_client import LocalLLMClient
+from web_retriever import WebRetriever, WebDoc
+# Keep everything CPU-only: when PyTorch reports an Apple MPS backend, enable the MPS fallback switch and make CPU the default device.
+if torch.backends.mps.is_available():
+    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # PyTorch env switch: ops unsupported on MPS fall back to CPU
+    torch.set_default_device("cpu")  # new tensors are created on the CPU unless a device is given
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/static-similarity-mrl-multilingual-v1"  # default for RAGTreatmentSearchApp(embedding_model_name=...)
+@dataclass
+class RetrievedCandidate:  # one database row returned by a similarity search, together with its score
+    row_index: int  # positional row number in the loaded DataFrame (the search code reads it back with .iloc)
+    similarity: float  # cosine similarity between the query embedding and this row's embedding
+    procedure: str  # stripped text of the "Procedure" column
+    region: str  # stripped text of the "Region" column
+    sub_zone: str  # stripped text of the "Sub-Zone" column
+    type: str  # stripped text of the "Type" column, as written in the sheet (not normalized)
+    technique: str  # stripped text of the "Technique / Technology / Brand" column
+    concerns: str  # stripped text of the "Aesthetic Concerns" column
+    verbatims: str  # stripped text of the "Verbatims" column
+def _norm_text(x: str) -> str:
+    return " ".join(str(x or "").strip().lower().split())
+def _subzone_mask(df: pd.DataFrame, sub_zone: str) -> pd.Series:
+    """Robust sub-zone matching.
+    1) Try strict equality on normalized values.
+    2) If strict match yields 0 rows, broaden to a fuzzy match:
+       - match if DB sub-zone contains the selected sub-zone (e.g., "under-eyes" contains "eyes"), OR
+       - match if selected sub-zone contains the DB sub-zone.
+    This specifically fixes cases like:
+      selected: "Eyes"  -> DB rows: "Under-Eyes", "Tear Troughs", "Eyes / crow's feet"
+    """
+    sz = _norm_text(sub_zone)
+    if not sz:
+        return pd.Series([True] * len(df), index=df.index)
+    strict = df["_subzone_norm"].eq(sz)
+    if int(strict.sum()) > 0:
+        return strict
+    key_tokens = [t for t in re.split(r"[^a-z0-9]+", sz) if t]
+    if not key_tokens:
+        return strict
+    def _fuzzy(cell: str) -> bool:
+        c = _norm_text(cell)
+        if not c:
+            return False
+        if sz in c or c in sz:
+            return True
+        return any(tok in c for tok in key_tokens)
+    return df["_subzone_norm"].apply(_fuzzy)
+def _norm_type_value(x: str) -> str:
+    """Normalize Excel 'Type' values to either 'surgical' or 'non-surgical'."""
+    t = _norm_text(x)
+    t = t.replace("_", "-").replace("–", "-").replace("—", "-")
+    if "non" in t and "surg" in t:
+        return "non-surgical"
+    if "surg" in t:
+        return "surgical"
+    return ""
+def _norm_type_choice(choice: str) -> str:
+    """Normalize UI choice to 'surgical' / 'non-surgical' / 'both'."""
+    c = _norm_text(choice)
+    if not c:
+        return "both"
+    if "both" in c:
+        return "both"
+    if "non" in c and "surg" in c:
+        return "non-surgical"
+    if "surg" in c:
+        return "surgical"
+    return "both"
+class RAGTreatmentSearchApp:
+    """Core engine: loads database.xlsx, creates/loads embeddings, and performs filtered retrieval + synthesis."""
+    def __init__(
+        self,
+        excel_path: str = "database.xlsx",
+        sheet_name: str = "All_Procedures",
+        embeddings_cache_path: str = "treatment_embeddings.pkl",
+        embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+        llm: Optional[LocalLLMClient] = None,
+        web: Optional[WebRetriever] = None,
+    ):
+        self.excel_path = excel_path
+        self.sheet_name = sheet_name
+        self.embeddings_cache_path = embeddings_cache_path
+        self.df = self._load_database()
+        self._normalize_columns()
+        self.model = SentenceTransformer(embedding_model_name, device="cpu")
+        self.embeddings, self.texts = self._load_or_create_embeddings()
+        self.llm = llm or LocalLLMClient()
+        self.web = web or WebRetriever()
+    # ------------------------------------------------------------------
+    # Data loading / normalization
+    # ------------------------------------------------------------------
+    def _load_database(self) -> pd.DataFrame:
+        xl = pd.ExcelFile(self.excel_path)
+        if self.sheet_name not in xl.sheet_names:
+            raise ValueError(
+                f"Sheet '{self.sheet_name}' not found in {self.excel_path}. Found: {xl.sheet_names}"
+            )
+        return pd.read_excel(self.excel_path, sheet_name=self.sheet_name)
+    def _normalize_columns(self) -> None:
+        required = [
+            "Type",
+            "Region",
+            "Sub-Zone",
+            "Procedure",
+            "Technique / Technology / Brand",
+            "Signature technique, brands, technology",
+            "Aesthetic Concerns",
+            "Verbatims",
+        ]
+        missing = [c for c in required if c not in self.df.columns]
+        if missing:
+            raise ValueError(
+                f"database.xlsx is missing required columns: {missing}. Found: {list(self.df.columns)}"
+            )
+        for col in ["Type", "Region", "Sub-Zone", "Procedure"]:
+            self.df[col] = self.df[col].astype(str).fillna("").str.strip()
+        self.df["_region_norm"] = self.df["Region"].astype(str).apply(_norm_text)
+        self.df["_subzone_norm"] = self.df["Sub-Zone"].astype(str).apply(_norm_text)
+        self.df["_type_norm"] = self.df["Type"].astype(str).apply(_norm_type_value)
+    def get_regions(self) -> List[str]:
+        regions = [r for r in self.df["Region"].dropna().unique().tolist() if str(r).strip()]
+        return sorted(regions)
+    def get_sub_zones(self, region: str) -> List[str]:
+        r = _norm_text(region)
+        sub = self.df[self.df["_region_norm"].eq(r)]["Sub-Zone"].dropna().unique().tolist()
+        return sorted([s for s in sub if str(s).strip()])
+    # ------------------------------------------------------------------
+    # Embedding text creation
+    # ------------------------------------------------------------------
+    def _row_to_text(self, row: pd.Series) -> str:
+        def safe(col: str) -> str:
+            v = row.get(col, "")
+            if pd.isna(v):
+                return ""
+            return str(v).strip()
+        parts = [
+            f"Type: {safe('Type')}",
+            f"Region: {safe('Region')}",
+            f"Sub-Zone: {safe('Sub-Zone')}",
+            f"Procedure: {safe('Procedure')}",
+        ]
+        tech = safe("Technique / Technology / Brand")
+        if tech:
+            parts.append(f"Technique/Technology/Brand: {tech}")
+        sig = safe("Signature technique, brands, technology")
+        if sig:
+            parts.append(f"Signature techniques/brands/technology: {sig}")
+        concerns = safe("Aesthetic Concerns")
+        if concerns:
+            parts.append(f"Aesthetic concerns: {concerns}")
+        verb = safe("Verbatims")
+        if verb:
+            parts.append(f"Patient verbatims: {verb}")
+        return " | ".join([p for p in parts if p.strip()])
+    # ------------------------------------------------------------------
+    # Embedding cache
+    # ------------------------------------------------------------------
+    def _load_or_create_embeddings(self) -> Tuple[np.ndarray, List[str]]:
+        if os.path.exists(self.embeddings_cache_path):
+            try:
+                with open(self.embeddings_cache_path, "rb") as f:
+                    data = pickle.load(f)
+                if (
+                    data.get("excel_path") == os.path.abspath(self.excel_path)
+                    and data.get("sheet_name") == self.sheet_name
+                ):
+                    emb = np.array(data["embeddings"], dtype=np.float32)
+                    txt = list(data["texts"])
+                    if len(txt) == len(self.df) and emb.shape[0] == len(self.df):
+                        return emb, txt
+            except Exception:
+                pass
+        return self._create_embeddings()
+    def _create_embeddings(self) -> Tuple[np.ndarray, List[str]]:
+        texts = [self._row_to_text(self.df.iloc[i]) for i in range(len(self.df))]
+        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True).astype(np.float32)
+        payload = {
+            "excel_path": os.path.abspath(self.excel_path),
+            "sheet_name": self.sheet_name,
+            "created_at": time.time(),
+            "model": getattr(self.model, "name_or_path", "unknown"),
+            "texts": texts,
+            "embeddings": embeddings.tolist(),
+        }
+        with open(self.embeddings_cache_path, "wb") as f:
+            pickle.dump(payload, f)
+        return embeddings, texts
+    def refresh_embeddings(self) -> None:
+        if os.path.exists(self.embeddings_cache_path):
+            os.remove(self.embeddings_cache_path)
+        self.embeddings, self.texts = self._create_embeddings()
+    # ------------------------------------------------------------------
+    # Retrieval: strict filter + semantic search
+    # ------------------------------------------------------------------
+    def _candidate_indices(
+        self,
+        region: str,
+        sub_zone: str,
+        type_choice_norm: str,
+    ) -> np.ndarray:
+        r = _norm_text(region)
+        df = self.df
+        base = df["_region_norm"].eq(r)
+        sz_norm = _norm_text(sub_zone)
+        if sz_norm:
+            strict_sz = df["_subzone_norm"].eq(sz_norm)
+        else:
+            strict_sz = pd.Series([True] * len(df), index=df.index)
+        def _apply(base_mask: pd.Series, sz_mask: pd.Series) -> np.ndarray:
+            m = base_mask & sz_mask
+            if type_choice_norm in {"surgical", "non-surgical"}:
+                m = m & df["_type_norm"].eq(type_choice_norm)
+            return np.where(m.values)[0]
+        # 1) strict sub-zone
+        idxs = _apply(base, strict_sz)
+        if idxs.size > 0 or not sz_norm:
+            return idxs
+        # 2) fuzzy sub-zone
+        fuzzy_sz = _subzone_mask(df, sub_zone)
+        idxs = _apply(base, fuzzy_sz)
+        return idxs
+    def _semantic_search_over_indices(
+        self,
+        idxs: np.ndarray,
+        query: str,
+        top_k: int,
+        min_similarity: float,
+    ) -> List[RetrievedCandidate]:
+        if idxs.size == 0:
+            return []
+        q_emb = self.model.encode([query], convert_to_numpy=True).astype(np.float32)
+        sub_emb = self.embeddings[idxs]
+        sims = cosine_similarity(q_emb, sub_emb)[0]
+        order = sims.argsort()[::-1]
+        results: List[RetrievedCandidate] = []
+        for rank_pos in order[: max(top_k, 1) * 3]:
+            sim = float(sims[rank_pos])
+            if sim < min_similarity:
+                continue
+            row_index = int(idxs[rank_pos])
+            row = self.df.iloc[row_index]
+            results.append(
+                RetrievedCandidate(
+                    row_index=row_index,
+                    similarity=sim,
+                    procedure=str(row.get("Procedure", "")).strip(),
+                    region=str(row.get("Region", "")).strip(),
+                    sub_zone=str(row.get("Sub-Zone", "")).strip(),
+                    type=str(row.get("Type", "")).strip(),
+                    technique=str(row.get("Technique / Technology / Brand", "")).strip(),
+                    concerns=str(row.get("Aesthetic Concerns", "")).strip(),
+                    verbatims=str(row.get("Verbatims", "")).strip(),
+                )
+            )
+            if len(results) >= top_k:
+                break
+        return results
+    def semantic_search(
+        self,
+        region: str,
+        sub_zone: str,
+        type_choice: str,
+        issue_text: str,
+        top_k: int = 12,
+        min_similarity: float = 0.18,
+        both_type_balanced: bool = True,
+    ) -> List[RetrievedCandidate]:
+        issue_text = (issue_text or "").strip()
+        if not issue_text:
+            return []
+        type_choice_norm = _norm_type_choice(type_choice)
+        query = f"Region: {region} | Sub-Zone: {sub_zone} | Type: {type_choice} | Issue: {issue_text}"
+        if type_choice_norm == "both" and both_type_balanced:
+            per_bucket = max(2, top_k // 2)
+            idx_s = self._candidate_indices(region, sub_zone, "surgical")
+            idx_n = self._candidate_indices(region, sub_zone, "non-surgical")
+            if idx_s.size == 0:
+                idx_s = self._candidate_indices(region, "", "surgical")
+            if idx_n.size == 0:
+                idx_n = self._candidate_indices(region, "", "non-surgical")
+            res_s = self._semantic_search_over_indices(idx_s, query, per_bucket, min_similarity)
+            res_n = self._semantic_search_over_indices(idx_n, query, per_bucket, min_similarity)
+            merged = res_s + res_n
+            by_idx: Dict[int, RetrievedCandidate] = {}
+            for c in merged:
+                prev = by_idx.get(c.row_index)
+                if prev is None or c.similarity > prev.similarity:
+                    by_idx[c.row_index] = c
+            out = list(by_idx.values())
+            out.sort(key=lambda x: x.similarity, reverse=True)
+            return out[:top_k]
+        idxs = self._candidate_indices(region, sub_zone, type_choice_norm if type_choice_norm != "both" else "")
+        if idxs.size == 0:
+            idxs = self._candidate_indices(region, "", type_choice_norm if type_choice_norm != "both" else "")
+        if idxs.size == 0:
+            r = _norm_text(region)
+            idxs = np.where(self.df["_region_norm"].eq(r).values)[0]
+        return self._semantic_search_over_indices(idxs, query, top_k, min_similarity)
+    # ------------------------------------------------------------------
+    # NEW: Global semantic search + mismatch detection
+    # ------------------------------------------------------------------
+    def _global_semantic_search(
+        self,
+        issue_text: str,
+        top_k: int = 20,
+        min_similarity: float = 0.18,
+    ) -> List[RetrievedCandidate]:
+        """Search across the ENTIRE database to infer likely region/sub-zones for the issue."""
+        issue_text = (issue_text or "").strip()
+        if not issue_text:
+            return []
+        q_emb = self.model.encode([issue_text], convert_to_numpy=True).astype(np.float32)
+        sims = cosine_similarity(q_emb, self.embeddings)[0]
+        order = sims.argsort()[::-1]
+        results: List[RetrievedCandidate] = []
+        for idx in order[: max(top_k, 1) * 4]:
+            sim = float(sims[idx])
+            if sim < min_similarity:
+                continue
+            row = self.df.iloc[int(idx)]
+            results.append(
+                RetrievedCandidate(
+                    row_index=int(idx),
+                    similarity=sim,
+                    procedure=str(row.get("Procedure", "")).strip(),
+                    region=str(row.get("Region", "")).strip(),
+                    sub_zone=str(row.get("Sub-Zone", "")).strip(),
+                    type=str(row.get("Type", "")).strip(),
+                    technique=str(row.get("Technique / Technology / Brand", "")).strip(),
+                    concerns=str(row.get("Aesthetic Concerns", "")).strip(),
+                    verbatims=str(row.get("Verbatims", "")).strip(),
+                )
+            )
+            if len(results) >= top_k:
+                break
+        return results
+    def _detect_mismatch(
+        self,
+        selected_region: str,
+        selected_sub_zone: str,
+        local_candidates: List[RetrievedCandidate],
+        global_candidates: List[RetrievedCandidate],
+    ) -> Tuple[bool, str, List[Tuple[str, str, float]]]:
+        """Decide whether the issue text appears inconsistent with selected region/sub-zone.
+        Returns:
+          mismatch: bool
+          reason: short string
+          suggestions: list of (Region, Sub-Zone, score) from global candidates
+        """
+        sr = _norm_text(selected_region)
+        ssz = _norm_text(selected_sub_zone)
+        local_best = local_candidates[0].similarity if local_candidates else 0.0
+        global_best = global_candidates[0].similarity if global_candidates else 0.0
+        # Dedup suggested (Region, Sub-Zone) from global
+        seen = set()
+        suggestions: List[Tuple[str, str, float]] = []
+        for c in global_candidates:
+            key = (c.region, c.sub_zone)
+            if key in seen:
+                continue
+            seen.add(key)
+            suggestions.append((c.region, c.sub_zone, float(c.similarity)))
+            if len(suggestions) >= 8:
+                break
+        # If no global signal, do not block
+        if global_best <= 0.0:
+            return False, "no_global_signal", suggestions
+        # If local candidates are empty, and global is strong -> mismatch
+        if not local_candidates and global_best >= 0.45:
+            return True, "no_local_candidates_but_global_strong", suggestions
+        # Check whether selected region/sub-zone appears in global top results
+        def _same_selected(c: RetrievedCandidate) -> bool:
+            if _norm_text(c.region) != sr:
+                return False
+            # allow fuzzy containment between selected sub-zone and global sub-zone
+            cz = _norm_text(c.sub_zone)
+            if not ssz:
+                return True
+            if cz == ssz:
+                return True
+            if ssz in cz or cz in ssz:
+                return True
+            return False
+        selected_in_global = any(_same_selected(c) for c in global_candidates[:10])
+        # Mismatch rule (tuned for stability on small models):
+        # - global is meaningfully strong
+        # - local best is weak relative to global
+        # - and the selected region/sub-zone does NOT appear among global top signals
+        gap = global_best - local_best
+        if (global_best >= 0.50 and gap >= 0.10 and not selected_in_global):
+            return True, f"global_much_stronger_than_selected (gap={gap:.2f})", suggestions
+        # Another conservative rule: local best is very low but global is decent
+        if (global_best >= 0.48 and local_best <= 0.35 and not selected_in_global):
+            return True, "selected_signal_weak_vs_global", suggestions
+        return False, "no_mismatch", suggestions
+    def _build_mismatch_message(
+        self,
+        selected_region: str,
+        selected_sub_zone: str,
+        issue_text: str,
+        suggestions: List[Tuple[str, str, float]],
+    ) -> str:
+        """Use LLM to write a friendly mismatch notice, but force exact DB names."""
+        sug_lines = []
+        for i, (r, sz, sc) in enumerate(suggestions, start=1):
+            sug_lines.append(f"{i}. Region: {r} | Sub-Zone: {sz}")
+        allowed_block = "\n".join(sug_lines) if sug_lines else "(No suggestions available)"
+        prompt = f"""
+You are an assistant in an Aesthetic treatment search app.
+The user selected:
+- Region: {selected_region}
+- Sub-Zone: {selected_sub_zone}
+But the user's described problem is:
+"{issue_text}"
+Task:
+Write a short, polite warning that the selected Region/Sub-Zone do not seem appropriate for the problem,
+and suggest the most appropriate Region/Sub-Zone choices from the database list below.
+IMPORTANT RULES:
+- You MUST use the Region/Sub-Zone names EXACTLY as provided (do not invent new names).
+- Do NOT recommend procedures now; only guide the user to select correct Region/Sub-Zone.
+- Output MUST be Markdown.
+Database-based suggestions (use these exact strings):
+{allowed_block}
+Markdown output format:
+## Notice
+<1-2 sentence apology + mismatch explanation>
+## Suggested Region/Sub-Zones
+- Region → Sub-Zone
+- Region → Sub-Zone
+## Next step
+<one sentence instruction to re-run search with suggested categories>
+""".strip()
+        try:
+            msg = self.llm.generate(prompt, temperature=0.2, max_tokens=450)
+            msg = (msg or "").strip()
+            if msg:
+                return msg
+        except Exception:
+            pass
+        # Fallback deterministic message (no LLM)
+        lines = [
+            "## Notice",
+            "Sorry for inconvenience. Your selected body region/sub-zone does not seem appropriate for your described problem.",
+            "",
+            "## Suggested Region/Sub-Zones",
+        ]
+        if suggestions:
+            for (r, sz, _) in suggestions[:8]:
+                lines.append(f"- {r} → {sz}")
+        else:
+            lines.append("- (No suggestions found in database)")
+        lines += [
+            "",
+            "## Next step",
+            "Please select one of the suggested Region/Sub-Zones above and run the search again.",
+        ]
+        return "\n".join(lines).strip()
+    # ------------------------------------------------------------------
+    # LLM rerank + web-enriched final answer
+    # ------------------------------------------------------------------
+    def _llm_rerank(
+        self,
+        issue_text: str,
+        region: str,
+        sub_zone: str,
+        type_choice: str,
+        candidates: List[RetrievedCandidate],
+        top_k: int = 5,
+    ) -> List[RetrievedCandidate]:
+        if not candidates:
+            return []
+        cand_lines = []
+        for i, c in enumerate(candidates, start=1):
+            cand_lines.append(
+                f"{i}. {c.procedure} (Type: {c.type}; Region/Sub-Zone: {c.region}/{c.sub_zone})\n"
+                f"   Technique: {c.technique}\n"
+                f"   Concerns: {c.concerns}\n"
+            )
+        cand_block = "\n".join(cand_lines)
+        prompt = f"""
+You are a medical-aesthetics assistant helping select the best matching procedures from a structured database.
+User selections:
+- Region (body part): {region}
+- Sub-Zone: {sub_zone}
+- Treatment preference: {type_choice}
+- Issue/problem (free text): {issue_text}
+Candidate procedures (already filtered and semantically matched):
+{cand_block}
+Task:
+Pick the best {top_k} procedures that match the user's issue and selections.
+Output format (STRICT):
+Return ONLY a numbered list of procedure names, one per line, exactly as written in the candidates.
+Example:
+1) Procedure Name A
+2) Procedure Name B
+""".strip()
+        try:
+            raw = self.llm.generate(prompt, temperature=0.2, max_tokens=350)
+        except Exception:
+            return candidates[:top_k]
+        ranked_names: List[str] = []
+        for line in (raw or "").splitlines():
+            m = re.match(r"^\s*\d+\s*[\)\.-]\s*(.+?)\s*$", line)
+            if m:
+                ranked_names.append(m.group(1).strip())
+        if not ranked_names:
+            data = self.llm.safe_json_loads(raw)
+            for item in (data.get("ranked") or [])[:top_k]:
+                name = str(item.get("procedure", "")).strip()
+                if name:
+                    ranked_names.append(name)
+        if not ranked_names:
+            return candidates[:top_k]
+        name_to_candidate = {c.procedure.lower(): c for c in candidates}
+        out: List[RetrievedCandidate] = []
+        for nm in ranked_names:
+            c = name_to_candidate.get(nm.lower())
+            if c and c not in out:
+                out.append(c)
+            if len(out) >= top_k:
+                break
+        for c in candidates:
+            if len(out) >= top_k:
+                break
+            if c not in out:
+                out.append(c)
+        return out
+    def _web_enrich(self, procedure_name: str) -> List[WebDoc]:
+        queries = [
+            f"{procedure_name} downtime recovery time",
+            f"{procedure_name} how long does it last results longevity",
+            f"{procedure_name} session duration minutes",
+            f"{procedure_name} risks side effects complications",
+            f"{procedure_name} candidacy who is it for",
+        ]
+        return self.web.search_and_fetch(queries, max_results_per_query=3, max_docs=8)
+    @staticmethod
+    def _format_web_evidence(docs: List[WebDoc], max_sources: int = 6) -> Tuple[str, List[str]]:
+        blocks = []
+        urls: List[str] = []
+        for i, d in enumerate(docs[:max_sources], start=1):
+            if d.url:
+                urls.append(d.url)
+            snippet = (d.snippet or "").strip()
+            blocks.append(
+                f"[Source {i}] {d.title}\nURL: {d.url}\nSnippet: {snippet}\n"
+            )
+        return "\n".join(blocks).strip(), urls
+    def answer(
+        self,
+        region: str,
+        sub_zone: str,
+        type_choice: str,
+        issue_text: str,
+        retrieval_k: int = 12,
+        final_k: int = 5,
+    ) -> Dict[str, object]:
+        """Full pipeline: retrieval -> mismatch check -> rerank -> web evidence -> synthesis."""
+        issue_text = (issue_text or "").strip()
+        candidates = self.semantic_search(
+            region=region,
+            sub_zone=sub_zone,
+            type_choice=type_choice,
+            issue_text=issue_text,
+            top_k=int(retrieval_k),
+        )
+        # NEW: mismatch detection against global semantic signal
+        global_cands = self._global_semantic_search(issue_text=issue_text, top_k=20, min_similarity=0.18)
+        mismatch, mismatch_reason, suggestions = self._detect_mismatch(
+            selected_region=region,
+            selected_sub_zone=sub_zone,
+            local_candidates=candidates,
+            global_candidates=global_cands,
+        )
+        if mismatch:
+            answer_md = self._build_mismatch_message(
+                selected_region=region,
+                selected_sub_zone=sub_zone,
+                issue_text=issue_text,
+                suggestions=suggestions,
+            )
+            return {
+                "answer_md": answer_md,
+                "sources": [],
+                "_debug": {
+                    "mismatch": True,
+                    "mismatch_reason": mismatch_reason,
+                    "candidate_count": len(candidates),
+                    "candidates": [
+                        {
+                            "procedure": c.procedure,
+                            "similarity": round(float(c.similarity), 4),
+                            "type": c.type,
+                            "region": c.region,
+                            "sub_zone": c.sub_zone,
+                        }
+                        for c in candidates[: min(len(candidates), 25)]
+                    ],
+                    "global_top": [
+                        {
+                            "procedure": c.procedure,
+                            "similarity": round(float(c.similarity), 4),
+                            "type": c.type,
+                            "region": c.region,
+                            "sub_zone": c.sub_zone,
+                        }
+                        for c in global_cands[:10]
+                    ],
+                    "suggested_region_subzones": [
+                        {"region": r, "sub_zone": sz, "score": round(float(sc), 4)}
+                        for (r, sz, sc) in suggestions
+                    ],
+                },
+            }
+        # Continue normal pipeline if no mismatch
+        best = self._llm_rerank(
+            issue_text=issue_text,
+            region=region,
+            sub_zone=sub_zone,
+            type_choice=type_choice,
+            candidates=candidates,
+            top_k=int(final_k),
+        )
+        web_bundle: Dict[str, List[WebDoc]] = {}
+        all_urls: List[str] = []
+        for c in best:
+            docs = self._web_enrich(c.procedure)
+            web_bundle[c.procedure] = docs
+            for d in docs:
+                if d.url:
+                    all_urls.append(d.url)
+        proc_blocks = []
+        for c in best:
+            evidence, _ = self._format_web_evidence(web_bundle.get(c.procedure, []))
+            proc_blocks.append(
+                f"PROCEDURE: {c.procedure}\n"
+                f"TYPE (from DB): {c.type}\n"
+                f"REGION/SUB-ZONE: {c.region} / {c.sub_zone}\n"
+                f"TECHNIQUE/TECHNOLOGY/BRAND (from DB): {c.technique}\n"
+                f"AESTHETIC CONCERNS (from DB): {c.concerns}\n"
+                f"WEB EVIDENCE:\n{evidence if evidence else '(No web evidence retrieved)'}\n"
+            )
+        synthesis_prompt = f"""
+You are a medical-aesthetics research assistant.
+User:
+- Region: {region}
+- Sub-Zone: {sub_zone}
+- Preference: {type_choice}
+- Issue: {issue_text}
+Selected procedures (database + web evidence):
+{chr(10).join(proc_blocks)}
+Task:
+Create a concise, high-signal comparison for the user. For EACH procedure, provide:
+- What it is (1-2 sentences)
+- Invasiveness (Non-invasive / Minimally invasive / Surgical)
+- Typical session duration
+- Downtime / recovery (typical range)
+- When results appear + longevity
+- Key risks / side effects
+- Best suited for (bullet points)
+Rules:
+- Base factual claims on the WEB EVIDENCE. If something is not supported, write "Not found in evidence".
+- Cite sources as [Source #] next to claims.
+- Output MUST be Markdown.
+- Include a short safety disclaimer at the end.
+""".strip()
+        try:
+            answer_md = self.llm.generate(synthesis_prompt, temperature=0.3, max_tokens=900)
+            answer_md = (answer_md or "").strip()
+        except Exception as e:
+            lines = [
+                "## Recommended treatments",
+                "(LLM generation failed; showing database + web evidence only.)",
+                "",
+            ]
+            for i, c in enumerate(best, start=1):
+                lines.append(f"### {i}) {c.procedure} ({c.type})")
+                lines.append(f"- Technique: {c.technique}")
+                lines.append(f"- Concerns: {c.concerns}")
+                docs = web_bundle.get(c.procedure, [])
+                if docs:
+                    lines.append("- Sources:")
+                    for d in docs[:6]:
+                        lines.append(f"  - {d.url}")
+                lines.append("")
+            lines.append("**Disclaimer:** This is general information and not medical advice. Consult a licensed clinician.")
+            answer_md = "\n".join(lines).strip() + f"\n\n(Reason: {repr(e)})"
+        seen = set()
+        dedup_urls: List[str] = []
+        for u in all_urls:
+            if u and u not in seen:
+                seen.add(u)
+                dedup_urls.append(u)
+        out: Dict[str, object] = {
+            "answer_md": answer_md,
+            "sources": dedup_urls,
+            "_debug": {
+                "mismatch": False,
+                "candidate_count": len(candidates),
+                "candidates": [
+                    {
+                        "procedure": c.procedure,
+                        "similarity": round(float(c.similarity), 4),
+                        "type": c.type,
+                        "region": c.region,
+                        "sub_zone": c.sub_zone,
+                    }
+                    for c in candidates[: min(len(candidates), 25)]
+                ],
+                "global_top": [
+                    {
+                        "procedure": c.procedure,
+                        "similarity": round(float(c.similarity), 4),
+                        "type": c.type,
+                        "region": c.region,
+                        "sub_zone": c.sub_zone,
+                    }
+                    for c in global_cands[:10]
+                ],
+            },
+        }
+        return out