Spaces:

mlbench123
/

aesthetic_AI

Sleeping

App Files Files Community

mlbench123 committed on Feb 9

Commit

da9369b

verified ·

1 Parent(s): 143a440

Update rag_treatment_app.py

Browse files

Files changed (1) hide show

rag_treatment_app.py +191 -72

rag_treatment_app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from __future__ import annotations
 import os
 import pickle
 import time
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@@ -14,6 +15,7 @@ from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from llm_client import LocalLLMClient
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/static-similarity-mrl-multilingual-v1"
@@ -26,10 +28,6 @@ def _norm(x: str) -> str:
 def _norm_type_value(x: str) -> str:
-    """
-    Normalize DB type to {surgical, non-surgical, ""}.
-    Handles many variants: Non surgical, non-surg, non-surgical, etc.
-    """
     t = _norm(x).replace("_", "-").replace("–", "-").replace("—", "-")
     if ("non" in t and "surg" in t) or ("nonsurg" in t):
         return "non-surgical"
@@ -84,6 +82,28 @@ def _na_db(v: str) -> str:
     return v if v else "Not found in database."
 # ---------------------------- data model ----------------------------
 @dataclass
@@ -121,14 +141,10 @@ class RetrievedCandidate:
 class RAGTreatmentSearchApp:
     """
-    HF-ready local structured RAG (DB-based details).
-    DB: database.xlsx (NEW schema)
-      - Uses sheet_name default: "Procedures"
-      - Reads procedure details from DB columns (no web calls)
-    API is kept compatible with your existing gradio_new_rag_app.py:
-      RAGTreatmentSearchApp(excel_path=..., embeddings_cache_path=...)
     """
     def __init__(
@@ -138,6 +154,7 @@ class RAGTreatmentSearchApp:
         embeddings_cache_path: str = "treatment_embeddings.pkl",
         embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
         llm: Optional[LocalLLMClient] = None,
     ):
         try:
             torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "2")))
@@ -155,15 +172,22 @@ class RAGTreatmentSearchApp:
         self.embeddings, self.texts = self._load_or_build_embeddings()
         self.llm = llm or LocalLLMClient()
-        # hard gate: avoid returning output when issue is empty
         self.min_issue_chars = int(os.getenv("MIN_ISSUE_CHARS", "5"))
-        # mismatch sensitivity (tuned)
         self.local_issue_min_sim = float(os.getenv("LOCAL_ISSUE_MIN_SIM", "0.42"))
         self.global_issue_min_sim = float(os.getenv("GLOBAL_ISSUE_MIN_SIM", "0.52"))
         self.global_local_delta = float(os.getenv("GLOBAL_LOCAL_DELTA", "0.10"))
     # ---------------- DB ----------------
     def _load_db(self) -> pd.DataFrame:
@@ -173,45 +197,32 @@ class RAGTreatmentSearchApp:
         return pd.read_excel(self.excel_path, sheet_name=self.sheet_name)
     def _normalize_columns(self) -> None:
-        """
-        Supports the NEW schema you described.
-        We also create UI-friendly aliases: Region, Sub-Zone, Procedure, Type.
-        """
-        # Required minimal new schema keys (based on your DB update)
-        required_any = [
-            "procedure_title",
-            "main_zone",
-            "treatment_type",
-        ]
         missing_any = [c for c in required_any if c not in self.df.columns]
         if missing_any:
             raise ValueError(f"Database missing required columns: {missing_any}")
-        # Build unified Region/Sub-Zone fields
-        # Region -> main_zone
         self.df["Region"] = self.df["main_zone"].fillna("").astype(str).str.strip()
-        # Sub-Zone: prefer face_subzone else body_subzone else any existing fallback
         if "face_subzone" in self.df.columns or "body_subzone" in self.df.columns:
-            face = self.df["face_subzone"].fillna("").astype(str).str.strip() if "face_subzone" in self.df.columns else ""
-            body = self.df["body_subzone"].fillna("").astype(str).str.strip() if "body_subzone" in self.df.columns else ""
-            sub = face
-            if isinstance(sub, str):
-                # shouldn't happen, but keep safe
-                sub = ""
-            self.df["Sub-Zone"] = face
-            mask_empty = self.df["Sub-Zone"].eq("") | self.df["Sub-Zone"].str.lower().eq("nan")
-            if not isinstance(body, str):
-                self.df.loc[mask_empty, "Sub-Zone"] = body.loc[mask_empty]
         else:
-            # last fallback if DB already has something named Sub-Zone
             self.df["Sub-Zone"] = self.df.get("Sub-Zone", "").fillna("").astype(str).str.strip()
-        # Procedure/Type
         self.df["Procedure"] = self.df["procedure_title"].fillna("").astype(str).str.strip()
         self.df["Type"] = self.df["treatment_type"].fillna("").astype(str).str.strip()
-        # Normalize core columns
         for col in ["Type", "Region", "Sub-Zone", "Procedure"]:
             self.df[col] = self.df[col].astype(str).fillna("").str.strip()
@@ -233,22 +244,145 @@ class RAGTreatmentSearchApp:
                 out.append(ss)
         return sorted(out)
-    # ---------------- Embeddings ----------------
-    def _row_to_text(self, row: pd.Series) -> str:
         """
-        Build semantic text from DB fields (for embeddings).
-        Keep it compact but informative so issue-only similarity works.
         """
-        proc = _db_str(row.get("procedure_title", ""))
-        reg = _db_str(row.get("main_zone", ""))
         sub = _db_str(row.get("Sub-Zone", ""))
-        typ = _db_str(row.get("treatment_type", ""))
         short_desc = _first_present(row, ["short_description", "procedure_description", "description"])
         concerns = _first_present(row, ["concerns", "aesthetic_concerns", "Aesthetic Concerns"])
         techniques = _first_present(row, ["techniques_brands_variants", "Technique / Technology / Brand", "techniques"])
         expected = _first_present(row, ["expected_results", "expected_result"])
         sidefx = _first_present(row, ["potential_side_effects", "side_effects", "risks"])
@@ -321,6 +455,7 @@ class RAGTreatmentSearchApp:
                 RetrievedCandidate(
                     row_index=row_index,
                     similarity=float(sims[pos]),
                     procedure=_na_db(proc),
                     region=_na_db(reg),
                     sub_zone=_na_db(sub),
@@ -347,8 +482,10 @@ class RAGTreatmentSearchApp:
                     average_cost_max_chf=_na_db(_first_present(row, ["average_cost_max_chf"])),
                 )
             )
             if len(out) >= top_k:
                 break
         return out
     def _global_semantic(self, issue_text: str, top_k: int = 15) -> List[RetrievedCandidate]:
@@ -361,7 +498,6 @@ class RAGTreatmentSearchApp:
         out: List[RetrievedCandidate] = []
         for idx in order[: max(top_k, 1) * 20]:
             row = self.df.iloc[int(idx)]
-            # Build minimal candidate (details not required for mismatch suggestion list)
             proc = _db_str(row.get("procedure_title", "")) or _db_str(row.get("Procedure", ""))
             reg = _db_str(row.get("main_zone", "")) or _db_str(row.get("Region", ""))
             sub = _db_str(row.get("Sub-Zone", "")) or _db_str(row.get("face_subzone", "")) or _db_str(row.get("body_subzone", ""))
@@ -371,6 +507,7 @@ class RAGTreatmentSearchApp:
                 RetrievedCandidate(
                     row_index=int(idx),
                     similarity=float(sims[idx]),
                     procedure=_na_db(proc),
                     region=_na_db(reg),
                     sub_zone=_na_db(sub),
@@ -399,12 +536,10 @@ class RAGTreatmentSearchApp:
             )
             if len(out) >= top_k:
                 break
         return out
     def _local_issue_only_best_sim(self, region: str, sub_zone: str, type_choice: str, issue_text: str) -> float:
-        """
-        Compute issue-only similarity inside selected region/sub-zone to detect irrelevance.
-        """
         issue_text = (issue_text or "").strip()
         if not issue_text:
             return 0.0
@@ -418,7 +553,6 @@ class RAGTreatmentSearchApp:
             idxs = self._candidate_indices(region, sub_zone, t)
         if idxs.size == 0:
-            # region only
             if t == "both":
                 idx_s = self._candidate_indices(region, "", "surgical")
                 idx_n = self._candidate_indices(region, "", "non-surgical")
@@ -433,16 +567,8 @@ class RAGTreatmentSearchApp:
         sims = cosine_similarity(q_emb, self.embeddings[idxs])[0]
         return float(np.max(sims)) if sims.size else 0.0
-    def semantic_search(
-        self,
-        region: str,
-        sub_zone: str,
-        type_choice: str,
-        issue_text: str,
-        top_k: int = 12,
-    ) -> List[RetrievedCandidate]:
         type_norm = _norm_type_choice(type_choice)
         query = f"Region: {region} | Sub-Zone: {sub_zone} | Preference: {type_choice} | Issue: {issue_text}"
         if type_norm == "both":
@@ -451,7 +577,6 @@ class RAGTreatmentSearchApp:
             per = max(3, top_k // 2)
             res = self._semantic_over(idx_s, query, per) + self._semantic_over(idx_n, query, per)
             res.sort(key=lambda x: x.similarity, reverse=True)
-            # de-dupe by row index
             seen = set()
             out = []
             for c in res:
@@ -499,7 +624,6 @@ Return ONLY a comma-separated list of procedure names (exactly as written).
             if len(out) >= top_k:
                 break
-        # fill remainder
         for c in candidates:
             if len(out) >= top_k:
                 break
@@ -508,7 +632,7 @@ Return ONLY a comma-separated list of procedure names (exactly as written).
         return out
-    # ---------------- Formatting (DB details) ----------------
     def _format_cost(self, mn: str, mx: str, unit: str) -> str:
         if mn == "Not found in database." and mx == "Not found in database.":
@@ -572,7 +696,6 @@ Return ONLY a comma-separated list of procedure names (exactly as written).
         sub_zone = (sub_zone or "").strip()
         issue_text = (issue_text or "").strip()
-        # Hard gate: must provide issue text
         if not region or not sub_zone:
             return {
                 "answer_md": "Please select **Region** and **Sub-Zone** before running the search.",
@@ -595,7 +718,7 @@ Return ONLY a comma-separated list of procedure names (exactly as written).
                 "_debug": {"mismatch": False, "candidate_count": 0, "final_count": 0},
             }
-        # ---------- mismatch detection ----------
         global_cands = self._global_semantic(issue_text, top_k=15)
         global_best = global_cands[0].similarity if global_cands else 0.0
         local_best = candidates[0].similarity if candidates else 0.0
@@ -625,7 +748,6 @@ Return ONLY a comma-separated list of procedure names (exactly as written).
         )
         if mismatch_strict or mismatch_delta:
-            # suggest correct region/sub-zones based on issue text
             suggestions = []
             seen = set()
             for c in global_cands:
@@ -665,11 +787,8 @@ Please select one of the suggested **Region/Sub-Zones** and run the search again
                     "candidate_count": len(candidates),
                 },
             }
-        # ---------------------------------------
         best = self._llm_rerank(issue_text, candidates, top_k=int(final_k))
-        # Ensure exactly final_k if possible
         if len(best) < int(final_k):
             for c in candidates:
                 if c not in best:
@@ -682,7 +801,7 @@ Please select one of the suggested **Region/Sub-Zones** and run the search again
         return {
             "answer_md": answer_md,
-            "sources": [],  # DB-only mode
             "_debug": {
                 "mismatch": False,
                 "candidate_count": len(candidates),

 import os
 import pickle
+import re
 import time
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 from sklearn.metrics.pairwise import cosine_similarity
 from llm_client import LocalLLMClient
+from web_retriever import WebRetriever, WebDoc
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/static-similarity-mrl-multilingual-v1"
 def _norm_type_value(x: str) -> str:
     t = _norm(x).replace("_", "-").replace("–", "-").replace("—", "-")
     if ("non" in t and "surg" in t) or ("nonsurg" in t):
         return "non-surgical"
     return v if v else "Not found in database."
+def _split_concerns(text: str) -> List[str]:
+    """
+    Split a concerns cell into candidate concern phrases.
+    Handles ; , | newlines and bullet-ish formats.
+    """
+    t = (text or "").strip()
+    if not t:
+        return []
+    t = t.replace("•", "\n").replace("·", "\n")
+    parts = re.split(r"[;\n\|]+", t)
+    out = []
+    for p in parts:
+        p = p.strip(" -\t\r")
+        if not p:
+            continue
+        if len(p) > 120:
+            # keep short fragments only
+            continue
+        out.append(p)
+    return out
 # ---------------------------- data model ----------------------------
 @dataclass
 class RAGTreatmentSearchApp:
     """
+    DB-driven structured RAG + Common Concerns (internet -> fallback DB).
+    - Core recommendations: semantic retrieval + LLM rerank + formatting from DB columns.
+    - Common concerns: fetch short common issues for Region/Sub-Zone to help the user fill the issue box.
     """
     def __init__(
         embeddings_cache_path: str = "treatment_embeddings.pkl",
         embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
         llm: Optional[LocalLLMClient] = None,
+        web: Optional[WebRetriever] = None,
     ):
         try:
             torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "2")))
         self.embeddings, self.texts = self._load_or_build_embeddings()
         self.llm = llm or LocalLLMClient()
+        self.web = web or WebRetriever()
+        # gates + mismatch knobs
         self.min_issue_chars = int(os.getenv("MIN_ISSUE_CHARS", "5"))
         self.local_issue_min_sim = float(os.getenv("LOCAL_ISSUE_MIN_SIM", "0.42"))
         self.global_issue_min_sim = float(os.getenv("GLOBAL_ISSUE_MIN_SIM", "0.52"))
         self.global_local_delta = float(os.getenv("GLOBAL_LOCAL_DELTA", "0.10"))
+        # common concerns config
+        self.common_web_enabled = os.getenv("COMMON_CONCERNS_WEB_ENABLED", "1").strip() != "0"
+        self.common_max_docs = int(os.getenv("COMMON_CONCERNS_MAX_DOCS", "4"))
+        self.common_max_chars = int(os.getenv("COMMON_CONCERNS_MAX_CHARS", "900"))
+        self.common_top_n = int(os.getenv("COMMON_CONCERNS_TOP_N", "4"))
+        self._common_cache: Dict[Tuple[str, str], List[str]] = {}
     # ---------------- DB ----------------
     def _load_db(self) -> pd.DataFrame:
         return pd.read_excel(self.excel_path, sheet_name=self.sheet_name)
     def _normalize_columns(self) -> None:
+        required_any = ["procedure_title", "main_zone", "treatment_type"]
         missing_any = [c for c in required_any if c not in self.df.columns]
         if missing_any:
             raise ValueError(f"Database missing required columns: {missing_any}")
+        # Region
         self.df["Region"] = self.df["main_zone"].fillna("").astype(str).str.strip()
+        # Sub-Zone (prefer face_subzone, else body_subzone, else existing Sub-Zone)
         if "face_subzone" in self.df.columns or "body_subzone" in self.df.columns:
+            face = self.df["face_subzone"].fillna("").astype(str).str.strip() if "face_subzone" in self.df.columns else None
+            body = self.df["body_subzone"].fillna("").astype(str).str.strip() if "body_subzone" in self.df.columns else None
+            if face is None:
+                self.df["Sub-Zone"] = body
+            else:
+                self.df["Sub-Zone"] = face
+                mask_empty = self.df["Sub-Zone"].eq("") | self.df["Sub-Zone"].str.lower().eq("nan")
+                if body is not None:
+                    self.df.loc[mask_empty, "Sub-Zone"] = body.loc[mask_empty]
         else:
             self.df["Sub-Zone"] = self.df.get("Sub-Zone", "").fillna("").astype(str).str.strip()
+        # Procedure / Type aliases
         self.df["Procedure"] = self.df["procedure_title"].fillna("").astype(str).str.strip()
         self.df["Type"] = self.df["treatment_type"].fillna("").astype(str).str.strip()
         for col in ["Type", "Region", "Sub-Zone", "Procedure"]:
             self.df[col] = self.df[col].astype(str).fillna("").str.strip()
                 out.append(ss)
         return sorted(out)
+    # ---------------- Common concerns ----------------
+    def _db_common_concerns(self, region: str, sub_zone: str, n: int = 4) -> List[str]:
+        """
+        Fallback: extract most frequent short concerns from DB rows in selected Region/Sub-Zone.
+        """
+        r = _norm(region)
+        sz = _norm(sub_zone)
+        m = self.df["_region_norm"].eq(r)
+        if sz:
+            m = m & (self.df["_subzone_norm"].eq(sz) | self.df["_subzone_norm"].str.contains(sz, na=False))
+        df2 = self.df[m]
+        if df2.empty:
+            return []
+        counts: Dict[str, int] = {}
+        for _, row in df2.iterrows():
+            concerns = _first_present(row, ["concerns", "Aesthetic Concerns", "aesthetic_concerns"])
+            for c in _split_concerns(concerns):
+                key = c.strip()
+                if len(key) < 4:
+                    continue
+                counts[key] = counts.get(key, 0) + 1
+        ranked = sorted(counts.items(), key=lambda x: (-x[1], x[0].lower()))
+        return [k for (k, _) in ranked[: max(1, n)]]
+    def _web_common_concerns(self, region: str, sub_zone: str, n: int = 4) -> List[str]:
+        """
+        Internet-based: get common concerns for Region/Sub-Zone; extract with LLM as short phrases.
+        If web is blocked/rate-limited on HF, this naturally falls back to DB list.
+        """
+        if not self.common_web_enabled:
+            return []
+        region = (region or "").strip()
+        sub_zone = (sub_zone or "").strip()
+        if not region or not sub_zone:
+            return []
+        queries = [
+            f"common aesthetic concerns {region} {sub_zone}",
+            f"most common problems {sub_zone} aesthetic treatment",
+            f"{sub_zone} cosmetic concerns dark circles wrinkles pigmentation",
+        ]
+        docs = self.web.search_and_fetch(
+            queries=queries,
+            max_results_per_query=2,
+            max_docs=self.common_max_docs,
+            max_chars_per_doc=self.common_max_chars,
+        )
+        if not docs:
+            return []
+        def compact(s: str, limit: int = 650) -> str:
+            s = re.sub(r"\s+", " ", (s or "").strip())
+            return (s[:limit] + "…") if len(s) > limit else s
+        ev = []
+        for i, d in enumerate(docs[:4], start=1):
+            ev.append(f"[Doc {i}] {d.title}\n{compact(d.snippet)}")
+        evidence = "\n\n".join(ev)
+        prompt = f"""
+You are extracting ONLY common patient concerns (issues) for:
+Region: {region}
+Sub-Zone: {sub_zone}
+From the evidence, output STRICT JSON:
+{{"concerns": ["...","..."]}}
+Rules:
+- return 1 to {n} short concern phrases (3-8 words each)
+- no treatment names, only issues/concerns
+- deduplicate similar items
+- if unclear, return fewer items
+Evidence:
+{evidence}
+""".strip()
+        raw = (self.llm.generate(prompt, temperature=0.2, max_tokens=160) or "").strip()
+        data = self.llm.safe_json_loads(raw)
+        arr = data.get("concerns", [])
+        out: List[str] = []
+        if isinstance(arr, list):
+            for x in arr:
+                s = str(x).strip()
+                if not s:
+                    continue
+                if len(s) > 80:
+                    continue
+                if s.lower() in {z.lower() for z in out}:
+                    continue
+                out.append(s)
+        return out[:n]
+    def get_common_concerns(self, region: str, sub_zone: str, n: Optional[int] = None) -> List[str]:
         """
+        Public API for UI:
+          - first try internet extraction
+          - if it fails, use DB-derived concerns
+          - cached per (region, sub_zone)
         """
+        n = int(n or self.common_top_n)
+        key = (_norm(region), _norm(sub_zone))
+        if key in self._common_cache:
+            return self._common_cache[key]
+        concerns: List[str] = []
+        try:
+            concerns = self._web_common_concerns(region, sub_zone, n=n)
+        except Exception:
+            concerns = []
+        if not concerns:
+            concerns = self._db_common_concerns(region, sub_zone, n=n)
+        self._common_cache[key] = concerns
+        return concerns
+    # ---------------- Embeddings ----------------
+    def _row_to_text(self, row: pd.Series) -> str:
+        proc = _db_str(row.get("procedure_title", "")) or _db_str(row.get("Procedure", ""))
+        reg = _db_str(row.get("main_zone", "")) or _db_str(row.get("Region", ""))
         sub = _db_str(row.get("Sub-Zone", ""))
+        typ = _db_str(row.get("treatment_type", "")) or _db_str(row.get("Type", ""))
         short_desc = _first_present(row, ["short_description", "procedure_description", "description"])
         concerns = _first_present(row, ["concerns", "aesthetic_concerns", "Aesthetic Concerns"])
         techniques = _first_present(row, ["techniques_brands_variants", "Technique / Technology / Brand", "techniques"])
         expected = _first_present(row, ["expected_results", "expected_result"])
         sidefx = _first_present(row, ["potential_side_effects", "side_effects", "risks"])
                 RetrievedCandidate(
                     row_index=row_index,
                     similarity=float(sims[pos]),
                     procedure=_na_db(proc),
                     region=_na_db(reg),
                     sub_zone=_na_db(sub),
                     average_cost_max_chf=_na_db(_first_present(row, ["average_cost_max_chf"])),
                 )
             )
             if len(out) >= top_k:
                 break
         return out
     def _global_semantic(self, issue_text: str, top_k: int = 15) -> List[RetrievedCandidate]:
         out: List[RetrievedCandidate] = []
         for idx in order[: max(top_k, 1) * 20]:
             row = self.df.iloc[int(idx)]
             proc = _db_str(row.get("procedure_title", "")) or _db_str(row.get("Procedure", ""))
             reg = _db_str(row.get("main_zone", "")) or _db_str(row.get("Region", ""))
             sub = _db_str(row.get("Sub-Zone", "")) or _db_str(row.get("face_subzone", "")) or _db_str(row.get("body_subzone", ""))
                 RetrievedCandidate(
                     row_index=int(idx),
                     similarity=float(sims[idx]),
                     procedure=_na_db(proc),
                     region=_na_db(reg),
                     sub_zone=_na_db(sub),
             )
             if len(out) >= top_k:
                 break
         return out
     def _local_issue_only_best_sim(self, region: str, sub_zone: str, type_choice: str, issue_text: str) -> float:
         issue_text = (issue_text or "").strip()
         if not issue_text:
             return 0.0
             idxs = self._candidate_indices(region, sub_zone, t)
         if idxs.size == 0:
             if t == "both":
                 idx_s = self._candidate_indices(region, "", "surgical")
                 idx_n = self._candidate_indices(region, "", "non-surgical")
         sims = cosine_similarity(q_emb, self.embeddings[idxs])[0]
         return float(np.max(sims)) if sims.size else 0.0
+    def semantic_search(self, region: str, sub_zone: str, type_choice: str, issue_text: str, top_k: int = 12) -> List[RetrievedCandidate]:
         type_norm = _norm_type_choice(type_choice)
         query = f"Region: {region} | Sub-Zone: {sub_zone} | Preference: {type_choice} | Issue: {issue_text}"
         if type_norm == "both":
             per = max(3, top_k // 2)
             res = self._semantic_over(idx_s, query, per) + self._semantic_over(idx_n, query, per)
             res.sort(key=lambda x: x.similarity, reverse=True)
             seen = set()
             out = []
             for c in res:
             if len(out) >= top_k:
                 break
         for c in candidates:
             if len(out) >= top_k:
                 break
         return out
+    # ---------------- Formatting ----------------
     def _format_cost(self, mn: str, mx: str, unit: str) -> str:
         if mn == "Not found in database." and mx == "Not found in database.":
         sub_zone = (sub_zone or "").strip()
         issue_text = (issue_text or "").strip()
         if not region or not sub_zone:
             return {
                 "answer_md": "Please select **Region** and **Sub-Zone** before running the search.",
                 "_debug": {"mismatch": False, "candidate_count": 0, "final_count": 0},
             }
+        # mismatch detection
         global_cands = self._global_semantic(issue_text, top_k=15)
         global_best = global_cands[0].similarity if global_cands else 0.0
         local_best = candidates[0].similarity if candidates else 0.0
         )
         if mismatch_strict or mismatch_delta:
             suggestions = []
             seen = set()
             for c in global_cands:
                     "candidate_count": len(candidates),
                 },
             }
         best = self._llm_rerank(issue_text, candidates, top_k=int(final_k))
         if len(best) < int(final_k):
             for c in candidates:
                 if c not in best:
         return {
             "answer_md": answer_md,
+            "sources": [],
             "_debug": {
                 "mismatch": False,
                 "candidate_count": len(candidates),