Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- core/admin_tasks.py +265 -0
- core/bm25.py +137 -0
- core/glossary.py +109 -0
- core/glossary_builder.py +480 -0
- core/hybrid_retriever.py +925 -0
- core/retrieval.py +25 -0
- core/van_normalizer.py +57 -0
- core/vector_search.py +47 -0
- core/vector_store.py +208 -0
- core/vector_sync.py +208 -0
- core/web_loader.py +55 -0
core/admin_tasks.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
core/admin_tasks.py
|
| 3 |
+
|
| 4 |
+
Centralized admin / maintenance functions used by both the Gradio UI (app.py)
|
| 5 |
+
and the FastAPI admin endpoints (api.py). These are synchronous as in your
|
| 6 |
+
current setup and return friendly status strings for display.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import shutil
|
| 12 |
+
import glob
|
| 13 |
+
import traceback
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import pandas as pd
|
| 17 |
+
except Exception:
|
| 18 |
+
pd = None
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
import faiss
|
| 22 |
+
except Exception:
|
| 23 |
+
faiss = None
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
from sentence_transformers import SentenceTransformer
|
| 27 |
+
except Exception:
|
| 28 |
+
SentenceTransformer = None
|
| 29 |
+
|
| 30 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
| 31 |
+
|
| 32 |
+
# functions from your project (should exist)
|
| 33 |
+
# rebuild_faiss_from_glossary should return (index, metas) like before.
|
| 34 |
+
try:
|
| 35 |
+
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
|
| 36 |
+
except Exception:
|
| 37 |
+
rebuild_faiss_from_glossary = None
|
| 38 |
+
_upload_to_dataset = None
|
| 39 |
+
|
| 40 |
+
# Optional web loader
|
| 41 |
+
try:
|
| 42 |
+
from core.web_loader import web_crawler_loader
|
| 43 |
+
except Exception:
|
| 44 |
+
web_crawler_loader = None
|
| 45 |
+
|
| 46 |
+
PERSISTENT_DIR = "/home/user/app/persistent"
|
| 47 |
+
DATASET_INDEX_REPO = os.environ.get("DATASET_INDEX_REPO", "essprasad/CT-Chat-Index")
|
| 48 |
+
DATASET_DOCS_REPO = os.environ.get("DATASET_DOCS_REPO", "essprasad/CT-Chat-Docs")
|
| 49 |
+
|
| 50 |
+
def _ensure_dirs():
    """Create the persistent storage directory if it does not already exist."""
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
| 53 |
+
def clear_index():
    """Delete local FAISS and related caches. Returns a message string."""
    targets = [
        os.path.join(PERSISTENT_DIR, "faiss.index"),
        os.path.join(PERSISTENT_DIR, "faiss.index.meta.json"),
        os.path.join(PERSISTENT_DIR, "glossary.json"),
        "/home/user/app/data/docs_cache",
        "/home/user/app/runtime_faiss",
    ]
    messages = []
    for path in targets:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
                messages.append(f"🗑️ Deleted folder: {path}")
            elif os.path.exists(path):
                os.remove(path)
                messages.append(f"🗑️ Deleted file: {path}")
        except Exception as e:
            messages.append(f"⚠️ Failed to delete {path}: {e}")
    # Nothing matched on disk at all.
    return "\n".join(messages) if messages else "ℹ️ No cache files found."
| 77 |
+
|
| 78 |
+
def rebuild_glossary():
    """
    Calls the existing glossary builder (core.glossary_builder.rebuild_and_upload).
    Returns status string.
    """
    try:
        from core.glossary_builder import rebuild_and_upload
    except Exception as e:
        return f"⚠️ Cannot import glossary builder: {e}"
    try:
        rebuild_and_upload()
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}\n{traceback.format_exc()}"
    return "✅ Glossary rebuilt and uploaded successfully."
| 94 |
+
|
| 95 |
+
def _collect_excel_entries(excel_files):
    """Download each Excel file from the docs dataset and extract glossary-style rows (best-effort)."""
    entries = []
    for file_name in excel_files:
        try:
            fp = hf_hub_download(repo_id=DATASET_DOCS_REPO, filename=file_name, repo_type="dataset")
            # Read all sheets and look for MRCT-style columns (best-effort).
            try:
                xls = pd.read_excel(fp, sheet_name=None)
            except Exception:
                xls = {}
            for sheet, df in xls.items():
                if not isinstance(df, pd.DataFrame):
                    continue
                cols = [c.lower() for c in df.columns.astype(str)]
                # Heuristic — only index sheets that carry a glossary/term header.
                if not any("glossary term" in c or "term" == c.strip().lower() for c in cols):
                    continue
                df = df.fillna("").dropna(how="all")
                for _, row in df.iterrows():
                    term = str(row.get("Glossary Term", "") or row.get("term", "")).strip()
                    if not term:
                        # Fall back to the first column.
                        try:
                            term = str(row.iloc[0]).strip()
                        except Exception:
                            term = ""
                    if not term:
                        continue
                    combined = " ".join(str(x) for x in row.values if str(x).strip())
                    entries.append({
                        "file": file_name,
                        "sheet": sheet,
                        "term": term,
                        "type": "excel",
                        "text": combined,
                        "source": file_name,
                    })
        except Exception:
            # Non-fatal: skip a problematic Excel file entirely.
            continue
    return entries


def _save_and_upload_index(index, metas):
    """Persist index + metadata to PERSISTENT_DIR, then best-effort upload to the HF dataset."""
    try:
        faiss_path = os.path.join(PERSISTENT_DIR, "faiss.index")
        meta_path = os.path.join(PERSISTENT_DIR, "faiss.index.meta.json")
        if faiss is not None and hasattr(faiss, "write_index"):
            faiss.write_index(index, faiss_path)
            with open(meta_path, "w", encoding="utf-8") as f:
                json.dump(metas, f, indent=2)
            # Upload only when the project helper is importable.
            if _upload_to_dataset is not None:
                try:
                    _upload_to_dataset(faiss_path, meta_path, DATASET_INDEX_REPO)
                except Exception:
                    pass
    except Exception:
        pass


def rebuild_index(force_download_glossary: bool = False):
    """
    Rebuild FAISS index from glossary.json + Excel + (optionally) web content.

    Parameters
    ----------
    force_download_glossary : bool
        When True, re-download glossary.json from the HF dataset even if a
        local copy already exists.

    Returns
    -------
    str
        Friendly status string for display (success count or failure details).
    """
    _ensure_dirs()
    try:
        if rebuild_faiss_from_glossary is None:
            return "⚠️ rebuild_faiss_from_glossary is not available in core.vector_sync."

        glossary_path = os.path.join(PERSISTENT_DIR, "glossary.json")
        # Download glossary.json from the HF dataset when missing or forced.
        if not os.path.exists(glossary_path) or force_download_glossary:
            try:
                downloaded = hf_hub_download(
                    repo_id=DATASET_INDEX_REPO,
                    filename="persistent/glossary.json",
                    repo_type="dataset",
                )
                shutil.copy2(downloaded, glossary_path)
            except Exception as e:
                # FIX: only abort when there is no usable local copy; a failed
                # *forced* refresh falls back to the file already on disk
                # (the original aborted even with a valid local glossary).
                if not os.path.exists(glossary_path):
                    return f"⚠️ Could not download glossary.json from {DATASET_INDEX_REPO}: {e}"

        # Build the base FAISS index using the project helper.
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        loaded = len(metas) if isinstance(metas, (list, tuple)) else 0

        # Discover Excel files in the docs dataset (best-effort).
        try:
            repo_files = list_repo_files(DATASET_DOCS_REPO, repo_type="dataset")
            excel_files = [f for f in repo_files if f.lower().endswith((".xls", ".xlsx"))]
        except Exception:
            excel_files = []

        model = None
        # Embed and add Excel content when every optional dependency is present.
        # FIX: the original guard omitted `pd is not None` although pandas is
        # required inside the Excel path.
        if SentenceTransformer is not None and faiss is not None and pd is not None and excel_files:
            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            excel_entries = _collect_excel_entries(excel_files)
            if excel_entries:
                texts = [e["text"] for e in excel_entries]
                embs = model.encode(texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                try:
                    faiss.normalize_L2(embs)
                    index.add(embs)
                    if isinstance(metas, list):
                        metas.extend(excel_entries)
                        loaded = len(metas)
                except Exception:
                    # Index may be incompatible or None.
                    pass

        # Optionally fetch & embed web content if web_crawler_loader exists.
        if web_crawler_loader is not None and SentenceTransformer is not None and faiss is not None:
            try:
                web_entries = web_crawler_loader(
                    urls_file="/home/user/app/data/urls.txt",
                    cache_path=os.path.join(PERSISTENT_DIR, "web_cache.json"),
                    max_pages=2,
                    timeout=15,
                    force_refresh=False,
                )
                # FIX: keep embeddings and metadata aligned — the original
                # embedded only entries with text > 50 chars but extended
                # `metas` with ALL web entries, desynchronizing index rows
                # from their metadata.
                kept = [w for w in (web_entries or []) if len(w.get("text", "")) > 50]
                if kept:
                    if model is None:
                        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
                    web_texts = [w.get("text", "") for w in kept]
                    web_emb = model.encode(web_texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                    faiss.normalize_L2(web_emb)
                    index.add(web_emb)
                    if isinstance(metas, list):
                        metas.extend(kept)
                        loaded = len(metas)
            except Exception:
                pass

        # Save the index + meta back to persistent storage (and upload).
        _save_and_upload_index(index, metas)

        return f"✅ Rebuild complete: {loaded} entries."
    except Exception as e:
        tb = traceback.format_exc()
        return f"⚠️ Rebuild failed: {e}\n{tb}"
| 227 |
+
|
| 228 |
+
def reset_faiss_cache():
    """
    Wipe persistent & runtime FAISS/glossary, then call rebuild_glossary + rebuild_index.
    Returns concatenated status string.
    """
    report = []
    # Wipe persistent caches first.
    try:
        stale = [
            os.path.join(PERSISTENT_DIR, name)
            for name in ("faiss.index", "faiss.index.meta.json", "glossary.json", "web_cache.json")
        ]
        stale.append("/home/user/app/runtime_faiss")
        for path in stale:
            try:
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                elif os.path.exists(path):
                    os.remove(path)
            except Exception:
                pass
        report.append("🧹 Persistent FAISS + glossary caches cleared.")
    except Exception as e:
        report.append(f"⚠️ Failed clearing caches: {e}")

    # Rebuild the glossary, then the index, collecting each status line.
    try:
        report.append(rebuild_glossary())
    except Exception as e:
        report.append(f"⚠️ Rebuild glossary failed: {e}")
    try:
        report.append(rebuild_index())
    except Exception as e:
        report.append(f"⚠️ Rebuild index failed: {e}")

    return "\n".join(report)
|
core/bm25.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import math
|
| 5 |
+
from collections import defaultdict, Counter
|
| 6 |
+
|
| 7 |
+
# --- 🔧 NEW: Lightweight stemming and lemmatization helpers
|
| 8 |
+
try:
|
| 9 |
+
import nltk
|
| 10 |
+
from nltk.stem import WordNetLemmatizer, PorterStemmer
|
| 11 |
+
from nltk.corpus import wordnet
|
| 12 |
+
nltk.download("wordnet", quiet=True)
|
| 13 |
+
nltk.download("omw-1.4", quiet=True)
|
| 14 |
+
except Exception:
|
| 15 |
+
WordNetLemmatizer = PorterStemmer = None
|
| 16 |
+
|
| 17 |
+
# Initialize stemmer and lemmatizer
|
| 18 |
+
_lemmatizer = WordNetLemmatizer() if WordNetLemmatizer else None
|
| 19 |
+
_stemmer = PorterStemmer() if PorterStemmer else None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _normalize_token(token: str) -> str:
    """Normalize a token by lowercasing, lemmatizing, and stemming."""
    result = token.lower().strip()
    # Each step is optional (NLTK may be unavailable) and failure-tolerant.
    for transform in (
        _lemmatizer.lemmatize if _lemmatizer else None,
        _stemmer.stem if _stemmer else None,
    ):
        if transform is None:
            continue
        try:
            result = transform(result)
        except Exception:
            pass
    return result
| 37 |
+
|
| 38 |
+
class BM25:
    """Minimal Okapi BM25 scorer over a list of document dicts."""

    def __init__(self, corpus):
        # Each corpus item is a dict; its text may live under 'text',
        # 'definition', or 'content' (see _get_text).
        self.corpus = corpus
        self.tokenized_corpus = [self._tokenize(self._get_text(d)) for d in corpus]
        self.doc_lens = [len(toks) for toks in self.tokenized_corpus]
        self.avgdl = sum(self.doc_lens) / len(self.doc_lens) if self.doc_lens else 0.0
        self.doc_freqs = self._calc_doc_freqs()
        # Standard BM25 hyperparameters.
        self.k1 = 1.5
        self.b = 0.75

    def _get_text(self, doc):
        """Safely extract text from multiple possible keys ('text', 'definition', 'content')."""
        if isinstance(doc, dict):
            for key in ("text", "definition", "content"):
                value = doc.get(key)
                if isinstance(value, str):
                    return value
        return ""

    def _tokenize(self, text):
        """Tokenize and normalize each word with stemming and lemmatization."""
        return [_normalize_token(tok) for tok in re.findall(r"\w+", (text or "").lower()) if tok]

    def _calc_doc_freqs(self):
        """Document frequency per term: number of documents containing it."""
        df = defaultdict(int)
        for tokens in self.tokenized_corpus:
            for term in set(tokens):
                df[term] += 1
        return df

    def _idf(self, term):
        """Smoothed inverse document frequency (avoids division issues)."""
        n_docs = len(self.tokenized_corpus)
        if n_docs <= 0:
            return 0.0
        df = self.doc_freqs.get(term, 0)
        return math.log(1 + (n_docs - df + 0.5) / (df + 0.5))

    def get_scores(self, query_tokens):
        """Return the BM25 score of every corpus document for the query tokens."""
        scores = []
        for tokens, dl in zip(self.tokenized_corpus, self.doc_lens):
            tf_map = Counter(tokens)
            total = 0.0
            for term in query_tokens:
                tf = tf_map.get(term, 0)
                # `or 1.0` guards against a zero average document length.
                denom = tf + self.k1 * (1 - self.b + self.b * dl / (self.avgdl or 1.0))
                if denom != 0:
                    total += self._idf(term) * ((tf * (self.k1 + 1)) / denom)
            scores.append(total)
        return scores
| 95 |
+
|
| 96 |
+
def search_bm25(query, docs=None, top_n=10):
    """
    BM25 search helper.
    - query: string
    - docs: optional list of dicts (each may have 'text'/'definition'/'content');
      if None, will load from vector_store.load_all_text_chunks()
    - top_n: int
    Returns list of doc dicts with added 'score' field.
    """
    from core.vector_store import load_all_text_chunks

    if docs is None:
        docs = load_all_text_chunks() or []
    if not docs:
        return []

    ranker = BM25(docs)

    # Normalize query tokens with the same stem/lemma logic as the corpus.
    tokens = [_normalize_token(t) for t in re.findall(r"\w+", (query or "").lower()) if t]
    if not tokens:
        return []

    scores = ranker.get_scores(tokens)

    # Boost Excel glossary sources (MRCT, xlsx/xls) by +15%.
    boost_markers = (".xlsx", ".xls", "mrct", "clinical-research-glossary")
    for idx, doc in enumerate(docs):
        origin = (doc.get("file") or doc.get("source") or "").lower()
        if any(marker in origin for marker in boost_markers):
            scores[idx] *= 1.15

    # Rank and return the top_n documents.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_n]
    results = []
    for idx in ranked:
        hit = dict(docs[idx])  # shallow copy
        # Ensure a 'text' key exists so the retriever can render it.
        if "text" not in hit:
            hit["text"] = ranker._get_text(hit)
        hit["score"] = float(scores[idx])
        results.append(hit)
    return results
|
core/glossary.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# core/glossary.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from difflib import get_close_matches
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
| 9 |
+
GLOSSARY = None
|
| 10 |
+
GLOSSARY_TERMS_CACHE = [] # 🧠 Cache of glossary keys for fuzzy matching
|
| 11 |
+
DATASET_REPO = "essprasad/CT-Chat-Index"
|
| 12 |
+
GLOSSARY_FILENAME = "persistent/glossary.json"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback."""
    if not term:
        return ""
    cleaned = re.sub(r'\s+', ' ', re.sub(r'[\-_/\\.,;:]+', ' ', term.lower().strip()))

    # Common clinical research synonym normalization. Order matters: longer
    # phrases collapse before their substrings ("serious adverse event"
    # before "adverse event").
    replacements = (
        ("e crf", "ecrf"), ("e-crf", "ecrf"), ("e/crf", "ecrf"), ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip()

    # Fuzzy matching fallback (for plural/singular forms or typos).
    if GLOSSARY_TERMS_CACHE and cleaned not in GLOSSARY_TERMS_CACHE:
        close = get_close_matches(cleaned, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if close:
            # Return the closest existing key for better recall.
            return close[0]

    return cleaned
| 46 |
+
|
| 47 |
+
def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached in module globals)."""
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        local_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(local_path, "r", encoding="utf-8") as fh:
            raw = json.load(fh)

        GLOSSARY = {}
        for key, entry in raw.items():
            # Skip non-string keys, over-long phrases, and year-like entries.
            if not isinstance(key, str) or len(key.split()) > 12 or re.search(r'\d{4}', key):
                continue

            display = key
            if isinstance(entry, dict):
                display = entry.get("term") or entry.get("name") or entry.get("title") or key

            norm = _normalize_term(display)
            if not norm:
                continue

            if isinstance(entry, dict):
                definition = entry.get("definition") or entry.get("text") or ""
                sources = entry.get("sources", [])
            elif isinstance(entry, str):
                definition, sources = entry, []
            else:
                definition, sources = "", []

            # Drop empty / trivially short definitions.
            if not definition or len(definition.strip()) < 5:
                continue

            if norm in GLOSSARY:
                # Merge sources into the already-stored record.
                known = set(GLOSSARY[norm].get("sources", []))
                extra = set(sources) if sources else set()
                GLOSSARY[norm]["sources"] = list(known.union(extra))
            else:
                GLOSSARY[norm] = {
                    "term": display.strip(),
                    "definition": definition.strip(),
                    "sources": sources if isinstance(sources, list) else [],
                }

        # Keep all glossary keys around for the fuzzy fallback matcher.
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())

        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
| 108 |
+
|
| 109 |
+
__all__ = ["_load_glossary", "_normalize_term"]
|
core/glossary_builder.py
ADDED
|
@@ -0,0 +1,480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
📘 glossary_builder.py — FINAL VERSION WITH MRCT SECTION FIX + CDISC + ABBREVIATIONS
|
| 3 |
+
------------------------------------------------------------------------------------
|
| 4 |
+
Builds a unified glossary from:
|
| 5 |
+
- PDF glossary files
|
| 6 |
+
- MRCT Clinical Research Glossary (Excel)
|
| 7 |
+
- CDISC Glossary (Excel)
|
| 8 |
+
- Abbreviations (Excel)
|
| 9 |
+
- Web glossary sources
|
| 10 |
+
|
| 11 |
+
Features:
|
| 12 |
+
✔ Correctly splits MRCT concatenated cells (Glossary Definition + Use in Context…)
|
| 13 |
+
✔ Removes all duplicated sections
|
| 14 |
+
✔ Maintains the correct order of sections
|
| 15 |
+
✔ Handles CDISC Submission Value → Definition extraction
|
| 16 |
+
✔ Handles Abbreviations.xlsx column patterns
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import re
|
| 21 |
+
import json
|
| 22 |
+
import time
|
| 23 |
+
import fitz
|
| 24 |
+
import requests
|
| 25 |
+
import pandas as pd
|
| 26 |
+
from bs4 import BeautifulSoup
|
| 27 |
+
from huggingface_hub import (
|
| 28 |
+
upload_file, HfFolder, list_repo_files, hf_hub_download
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ------------------------------------------------------------------------------
|
| 33 |
+
# CONFIG
|
| 34 |
+
# ------------------------------------------------------------------------------
|
| 35 |
+
DATASET_REPO = "essprasad/CT-Chat-Index"
|
| 36 |
+
DOCS_REPO = "essprasad/CT-Chat-Docs"
|
| 37 |
+
|
| 38 |
+
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
|
| 39 |
+
REMOTE_GLOSSARY = "persistent/glossary.json"
|
| 40 |
+
|
| 41 |
+
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
|
| 42 |
+
|
| 43 |
+
WEB_SOURCES = [
|
| 44 |
+
"https://mrctcenter.org/glossaryterm/clinical-research/",
|
| 45 |
+
"https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
|
| 46 |
+
"https://www.cdisc.org/",
|
| 47 |
+
"https://www.ich.org/",
|
| 48 |
+
"https://www.ema.europa.eu/",
|
| 49 |
+
"https://www.who.int/",
|
| 50 |
+
"https://clinicaltrials.gov/",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ------------------------------------------------------------------------------
|
| 55 |
+
# HELPERS
|
| 56 |
+
# ------------------------------------------------------------------------------
|
| 57 |
+
def normalize_term(term: str) -> str:
    """Lowercase, strip punctuation, collapse whitespace, then map known synonyms."""
    if not term:
        return ""
    cleaned = re.sub(r"\s+", " ", re.sub(r"[\-_/\\.,;:]+", " ", term.lower().strip()))
    # Exact-phrase synonym table: only whole-string matches are replaced.
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "good clinical practice": "gcp",
        "clinical study report": "csr",
        "informed consent form": "icf",
        "adverse event": "ae",
        "serious adverse event": "sae",
        "21 cfr part 11": "21cfrpart11",
    }
    return synonyms.get(cleaned, cleaned)
| 74 |
+
|
| 75 |
+
def extract_text_from_pdf(path):
    """Return the full plain text of a PDF (pages joined by newlines), or "" on error.

    FIX: the original only closed the document after a successful extraction,
    leaking the file handle when page.get_text raised; close now happens in a
    finally block.
    """
    try:
        document = fitz.open(path)
        try:
            return "\n".join(page.get_text("text") for page in document)
        finally:
            document.close()
    except Exception as e:
        print(f"⚠️ Error reading PDF {path}: {e}")
        return ""
| 85 |
+
|
| 86 |
+
def extract_definitions_from_text(text):
    """
    Heuristically split flat (PDF-extracted) text into term/definition pairs.

    A short alphanumeric line that does not end in '.' is treated as a term
    heading; the lines until the next heading form its definition. Definitions
    shorter than five words are discarded.
    """
    glossary = {}
    lines = [ln.strip() for ln in re.sub(r"\r", "", text).split("\n") if ln.strip()]

    pos = 0
    total = len(lines)
    while pos < total:
        heading = lines[pos]

        # Skip obvious non-terms: single characters and bare page numbers.
        if len(heading) <= 1 or heading.isdigit():
            pos += 1
            continue
        # Skip table-of-contents style headings.
        if any(marker in heading.lower() for marker in ("contents", "chapter", "appendix", "index")):
            pos += 1
            continue

        body = []
        cursor = pos + 1
        while cursor < total:
            candidate = lines[cursor]
            # A short letter-initial line without a trailing period starts the next term.
            if re.match(r"^[A-Za-z][A-Za-z0-9\- ]{0,20}$", candidate) and not candidate.endswith("."):
                break
            body.append(candidate)
            cursor += 1

        definition = " ".join(body).strip()
        if len(definition.split()) < 5:
            pos += 1
            continue

        glossary[normalize_term(heading)] = {
            "term": heading,
            "definition": definition,
        }
        pos = cursor

    return glossary
| 131 |
+
|
| 132 |
+
def detect_source_type(src: str) -> str:
    """Classify a source string as 'pdf', 'excel', 'web', or 'other'.

    File-extension checks run before the URL check, so a URL ending in .pdf
    is classified as 'pdf'.
    """
    lowered = src.lower()
    if lowered.endswith(".pdf"):
        return "pdf"
    if lowered.endswith((".xls", ".xlsx")):
        return "excel"
    return "web" if lowered.startswith("http") else "other"
| 142 |
+
|
| 143 |
+
def extract_web_glossary(url):
    """Scrape naive term/definition pairs from a web page.

    Fetches *url*, strips markup, and extracts up to 50 "Term: definition"
    style matches.  Returns a list of glossary entry dicts; on any error
    the list is empty (errors are printed, never raised).
    """
    entries = []
    try:
        print(f"🌐 Fetching {url}…")
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            print(f"⚠️ Skipped {url} (HTTP {resp.status_code})")
            return []

        page_text = BeautifulSoup(resp.text, "html.parser").get_text(separator="\n")

        pairs = re.findall(
            r"([A-Z][A-Za-z0-9 \-]{3,30})[:\-]\s*(.{10,200})", page_text
        )

        # Cap at 50 entries so noisy pages don't flood the glossary.
        for raw_term, raw_def in pairs[:50]:
            entries.append(
                {
                    "term": raw_term.strip(),
                    "definition": raw_def.strip(),
                    "sources": [url],
                    "file": url,
                    "type": "web",
                }
            )

    except Exception as e:
        print(f"⚠️ Web extraction error for {url}: {e}")

    return entries
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ------------------------------------------------------------------------------
# MRCT STRUCTURED CELL PARSER
# ------------------------------------------------------------------------------
# Section headings used by the MRCT glossary Excel export, in display order.
SECTION_LABELS = [
    "Glossary Definition",
    "Use in Context",
    "More Info",
    "Other Info to Think About When Joining a Study",
    "Related Terms",
    "Resource URL",
]

# Case-insensitive matcher for any section label (including the trailing
# colon).  Used with re.split, whose capture group keeps the label tokens
# in the resulting list so parse_mrct_cell can pair label -> value.
LABEL_RE = re.compile(
    r"(?i)(Glossary Definition:|Use in Context:|More Info:|Other Info to Think About When Joining a Study:|Related Terms:|Resource URL:)"
)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def parse_mrct_cell(cell: str):
    """Split one MRCT spreadsheet cell into (label, text) sections.

    re.split with LABEL_RE alternates label tokens (ending in ':') with
    their values; unlabeled leading text becomes the 'Glossary Definition'.
    Returns [] for blank or non-string cells.
    """
    if not isinstance(cell, str) or not cell.strip():
        return []

    collapsed = re.sub(r"\s{2,}", " ", cell.strip())

    # Split by labels (capture group keeps the label tokens in the list).
    fragments = re.split(LABEL_RE, collapsed)

    sections = []
    idx = 0
    total = len(fragments)
    while idx < total:
        fragment = fragments[idx].strip()
        if not fragment:
            idx += 1
        elif fragment.endswith(":"):
            # Label token: pair it with the fragment that follows it.
            value = fragments[idx + 1].strip() if idx + 1 < total else ""
            sections.append((fragment[:-1].strip(), value))
            idx += 2
        else:
            sections.append(("Glossary Definition", fragment))
            idx += 1
    return sections
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# ------------------------------------------------------------------------------
|
| 220 |
+
# MAIN: GLOSSARY REBUILD
|
| 221 |
+
# ------------------------------------------------------------------------------
|
| 222 |
+
def rebuild_and_upload():
    """Rebuild glossary.json from PDFs, Excel files, and web sources, then upload.

    Pipeline: list files in the DOCS_REPO dataset → extract definitions per
    source type → merge into one dict keyed by "<normalized term>__<file>" →
    write LOCAL_GLOSSARY → upload to DATASET_REPO when TOKEN is set.
    Per-source failures are printed and skipped; only the initial listing
    failure aborts the whole rebuild (RuntimeError).
    """
    start = time.time()
    print("📘 Starting glossary rebuild…")

    try:
        all_files = list_repo_files(
            repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN
        )
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"❌ Cannot list files: {e}")

    # Keyed by "<normalized term>__<source file>" so the same term coming
    # from different sources is preserved as separate entries.
    all_defs = {}

    # ----------------------------------------------------
    # 1️⃣ PDFs
    # ----------------------------------------------------
    # Filenames containing these substrings are not glossaries — skip them.
    skip_patterns = [
        "topic_", "template", "schedule", "protocol",
        "painac", "sas", "glossary_printable"
    ]

    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary PDF: {pdf}")
            continue

        print(f"🔍 Processing PDF: {pdf}")
        try:
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=pdf,
                token=TOKEN,
                repo_type="dataset"
            )
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)

            # Annotate each extracted definition with its provenance.
            for k, v in defs.items():
                v["sources"] = [pdf]
                v["file"] = pdf
                v["type"] = "pdf"
                all_defs[f"{k}__{pdf}"] = v

        except Exception as e:
            print(f"⚠️ PDF extraction error: {pdf}: {e}")

    # ----------------------------------------------------
    # 2️⃣ Excel files (MRCT + Abbreviations + CDISC)
    # ----------------------------------------------------
    for excel in excels:
        try:
            print(f"📗 Processing Excel: {excel}")
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                token=TOKEN,
                repo_type="dataset"
            )

            # sheet_name=None → dict of {sheet name: DataFrame} for all sheets.
            xls = pd.read_excel(path, sheet_name=None)

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.empty:
                    continue

                df.columns = [str(c).strip() for c in df.columns]
                lower_cols = {c.lower(): c for c in df.columns}

                # -----------------------------
                # Detect term column
                # -----------------------------
                # MRCT-style: "Glossary Term" or plain "Term".
                term_col = next(
                    (
                        c
                        for c in df.columns
                        if "glossary term" in c.lower() or c.lower() == "term"
                    ),
                    None,
                )

                # Abbreviations
                if not term_col:
                    for c in [
                        "acronym",
                        "abbreviation",
                        "acryonym/abbreviation/initial",
                        "initial",
                    ]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # CDISC
                if not term_col:
                    for c in ["cdisc submission value", "submission value"]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # Fallback
                if not term_col:
                    for c in df.columns:
                        if "submission" in c.lower():
                            term_col = c
                            break

                if not term_col:
                    print(f"⏩ Skipping sheet {sheet_name} — no term column")
                    continue

                # -----------------------------
                # MRCT Structured Format
                # -----------------------------
                if "Glossary Definition" in df.columns:
                    # All possible MRCT columns
                    mrct_cols = [
                        "Glossary Definition",
                        "Use in Context",
                        "More Info",
                        "Other Info to Think About When Joining a Study",
                        "Related Terms",
                        "Resource URL",
                    ]
                    def_cols = [c for c in mrct_cols if c in df.columns]
                else:
                    # Generic fallback
                    def_cols = [
                        c
                        for c in df.columns
                        if any(
                            k in c.lower()
                            for k in [
                                "definition",
                                "description",
                                "cdisc definition",
                                "context",
                                "info",
                                "related",
                            ]
                        )
                    ]

                # Last resort: assume the second column holds the definition.
                if not def_cols and len(df.columns) > 1:
                    def_cols = [df.columns[1]]

                # -----------------------------
                # Extract rows
                # -----------------------------
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    # Clean + dedupe: `seen` prevents the same text appearing
                    # twice (parsed section + raw column duplicate).
                    def_parts = []
                    seen = set()

                    if "Glossary Definition" in df.columns:
                        raw = str(row.get("Glossary Definition", "")).strip()
                        parsed = parse_mrct_cell(raw)

                        if parsed:
                            # Preferred order
                            for label in SECTION_LABELS:
                                for plabel, ptext in parsed:
                                    if plabel.lower() == label.lower() and ptext.strip():
                                        if ptext not in seen:
                                            def_parts.append(f"<b>{label}:</b> {ptext}")
                                            seen.add(ptext)

                            # Add missing columns (non-duplicates)
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                        else:
                            # Fallback to direct column reading
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                    else:
                        # Non-MRCT Excel rows
                        for c in def_cols:
                            val = str(row.get(c, "")).strip()
                            if val and val not in seen:
                                def_parts.append(f"<b>{c}:</b> {val}")
                                seen.add(val)

                    if not def_parts:
                        continue

                    # HTML fragments are joined with <br> for UI rendering.
                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [excel],
                        "file": excel,
                        "sheet": sheet_name,
                        "type": "excel",
                    }

                    key = f"{normalize_term(term)}__{excel}"
                    all_defs[key] = entry

            print(f"✅ Processed Excel: {excel}")

        except Exception as e:
            print(f"⚠️ Excel extraction error: {excel}: {e}")

    # ----------------------------------------------------
    # 3️⃣ Web Sources
    # ----------------------------------------------------
    web_defs = []
    for url in WEB_SOURCES:
        items = extract_web_glossary(url)
        for e in items:
            key = f"{normalize_term(e['term'])}__{e['file']}"
            all_defs[key] = e
            web_defs.append(e)

    print(f"🌐 Added {len(web_defs)} web entries.")

    # ----------------------------------------------------
    # 4️⃣ SAVE glossary.json
    # ----------------------------------------------------
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # ----------------------------------------------------
    # 5️⃣ UPLOAD TO HUGGINGFACE
    # ----------------------------------------------------
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

    print(f"✨ Glossary rebuild complete in {time.time() - start:.1f}s")
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Allow running this module directly as a one-shot rebuild script.
    rebuild_and_upload()
|
core/hybrid_retriever.py
ADDED
|
@@ -0,0 +1,925 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hybrid retriever (drop-in replacement)
|
| 3 |
+
-------------------------------------
|
| 4 |
+
- Preserves original function & variable names and signatures.
|
| 5 |
+
- Integrates CDISC Excel runtime loader, Abbreviations.xlsx loader,
|
| 6 |
+
PyMuPDF-based clinical-informatics PDF parser,
|
| 7 |
+
and MRCT duplicate-section dedupe.
|
| 8 |
+
- Injects abbreviation and CDISC entries as separate answers (one per term).
|
| 9 |
+
- Uses FAISS + BM25 retrieval as before.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import re
|
| 14 |
+
import time
|
| 15 |
+
import glob
|
| 16 |
+
from urllib.parse import urlparse
|
| 17 |
+
from difflib import SequenceMatcher
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# optional libs
|
| 24 |
+
try:
|
| 25 |
+
import pandas as pd
|
| 26 |
+
except Exception:
|
| 27 |
+
pd = None
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import fitz # PyMuPDF
|
| 31 |
+
except Exception:
|
| 32 |
+
fitz = None
|
| 33 |
+
|
| 34 |
+
# project imports
|
| 35 |
+
from core.glossary import _normalize_term
|
| 36 |
+
from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
|
| 37 |
+
from core.bm25 import search_bm25
|
| 38 |
+
from utils.nlp_helpers import extract_van_tokens, normalize_query_text
|
| 39 |
+
|
| 40 |
+
# ----------------------------
# CONFIG
# ----------------------------
DENSE_TOP_K = 10        # number of dense (FAISS) candidates retrieved per query
FUZZY_THRESHOLD = 0.15  # minimum SequenceMatcher ratio accepted as a fuzzy match
TOP_RESULTS_LIMIT = 5   # maximum results returned to the caller

GCDMP_FILENAME = "GCDMP_Glossary.pdf"  # exact filename in your HF space/persistent store
|
| 48 |
+
|
| 49 |
+
# ----------------------------
|
| 50 |
+
# UTILITIES (preserve names)
|
| 51 |
+
# ----------------------------
|
| 52 |
+
def fuzzy_ratio(a: str, b: str) -> float:
    """Similarity ratio in [0, 1] between two strings (None treated as "")."""
    left = a or ""
    right = b or ""
    return SequenceMatcher(None, left, right).ratio()
|
| 54 |
+
|
| 55 |
+
def strip_question_phrases(text: str) -> str:
    """Strip leading question/filler words so only the core query term remains.

    Repeatedly removes any leading word from *prefixes* (when followed by
    whitespace or an apostrophe), then trims trailing punctuation and
    collapses internal whitespace.
    """
    text = (text or "").lower().strip()
    # Fix: the original list had two missing commas ("us" "they" and
    # "expand on" "what"), which Python's implicit string concatenation
    # fused into the bogus tokens "usthey" / "expand onwhat" — so "us" and
    # "they" were never actually stripped.
    prefixes = [
        "what", "how", "when", "why", "define", "definition", "meaning", "explain",
        "describe", "expand", "abbreviate", "compare", "identify", "classify",
        "determine", "do", "did", "does", "done", "can", "shall",
        "will", "where", "which", "who", "whose", "have", "might", "could", "would",
        "kindly", "please", "may", "you", "i", "we", "us", "they", "there", "here",
        "what's", "i'll", "where's", "how's", "there's", "who's", "didn't", "doesn't",
        "give", "provide", "mention", "state", "arrange", "asking", "tell", "explain me",
        "can you", "could you", "would you", "please explain", "let me know",
        "say something about", "give details of", "show me", "find", "list", "expand on",
        "is", "was", "were", "are",
    ]
    # NOTE(review): alternation is first-match-wins, so short prefixes shadow
    # longer ones ("what" beats "what's", "expand" beats "expand on") — this
    # matches the original's behavior and is left unchanged here.
    prefix_pattern = r"^(" + "|".join(re.escape(p) for p in prefixes) + r")(\s+|['’])"
    while re.match(prefix_pattern, text):
        text = re.sub(prefix_pattern, "", text).strip()
    text = re.sub(r"[?.!]+$", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()
|
| 76 |
+
|
| 77 |
+
def add_links_to_text(text: str) -> str:
    """Wrap bare http(s) URLs in *text* with safe, new-tab anchor tags."""
    anchor = r'<a href="\1" target="_blank" rel="noopener noreferrer">\1</a>'
    return re.sub(r"(https?://[^\s<]+)", anchor, text)
|
| 79 |
+
|
| 80 |
+
def get_source_rank(src: str, src_type: str) -> int:
    """Priority rank for a source (1 = highest).

    Order: GCDMP glossary PDF, MRCT/Excel, ICH guidance documents,
    other PDFs, web sources, everything else.
    """
    name = (src or "").lower()

    # 1. GCDMP glossary PDF → highest priority
    if GCDMP_FILENAME.lower() in name:
        return 1

    # 2. MRCT Excel or MRCT filename
    if src_type == "excel" or "mrct" in name:
        return 2

    # 3. ICH documents (E6, E3, E2A, E9, E1)
    ich_markers = [
        "ich_e6", "ich-e6", "ich e6", "ich_e3", "ich-e3", "ich e3",
        "ich_e2", "ich-e2", "ich e2", "ich_e9", "ich-e9", "ich e9",
        "ich_e1", "ich-e1", "ich e1",
    ]
    if any(marker in name for marker in ich_markers):
        return 3

    # 4. Other PDFs
    if src_type == "pdf":
        return 4

    # 5. Web sources
    if src_type == "web":
        return 5

    return 6
|
| 98 |
+
|
| 99 |
+
# Patterns to filter junk lines commonly found in PDF extractions
JUNK_PATTERNS = [
    r"^\s*\d+\s*$",  # page-only lines
    r"^\s*Page\s+\d+\s*$",  # "Page N" footers
    r"^\s*Table of Contents.*$",
    r"^\s*Figure\s+\d+.*$",
    r"^\s*Section\s+\d+.*$",
    r".*\.{5,}.*",  # dotted lines
    r"^\s*-{3,}\s*$",  # dash rules
    r"^\s*_+\s*$",  # underscore rules
    r"^\s*required by regulatory authorities.*$",  # recurring boilerplate line
]
# Compiled once at import time; all matching is case-insensitive.
_COMPILED_JUNK = [re.compile(p, flags=re.IGNORECASE) for p in JUNK_PATTERNS]
|
| 112 |
+
|
| 113 |
+
def clean_extracted_text(text: str) -> str:
    """Drop junk lines (page numbers, rules, TOC noise) and tidy whitespace.

    Empty input is returned unchanged; otherwise returns the surviving
    lines joined with newlines, each with runs of dots and spaces collapsed.
    """
    if not text:
        return text

    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Skip any line matching one of the precompiled junk patterns.
        if any(pat.match(stripped) for pat in _COMPILED_JUNK):
            continue
        stripped = re.sub(r'\.{3,}', '.', stripped)
        stripped = re.sub(r'\s{2,}', ' ', stripped)
        kept.append(stripped)
    return "\n".join(kept)
|
| 133 |
+
|
| 134 |
+
def dedupe_section_headers(txt):
    """
    Remove repeated section headers such as 'Use in Context', 'More Info', etc.
    Keeps first occurrence of each heading.
    """
    if not txt:
        return txt

    known_headings = {
        "Glossary Definition", "Use in Context", "More Info",
        "Other Info to Think About When Joining a Study", "Related Terms", "Term URL",
        "Other Resources", "Resource URL"
    }
    emitted = set()
    result = []
    for line in txt.splitlines():
        # A heading is a line consisting solely of "<label>:".
        match = re.match(r"^([A-Za-z0-9 \-]{3,200}):\s*$", line)
        if match:
            heading = match.group(1).strip()
            if heading in known_headings:
                if heading in emitted:
                    # Duplicate of a known heading — drop it.
                    continue
                emitted.add(heading)
        result.append(line)
    return "\n".join(result)
|
| 162 |
+
|
| 163 |
+
# ----------------------------
# Excel / MRCT section parser (robust)
# ----------------------------
# Recognizes label variants and returns ordered sections as list of (label, text)
# Canonical output order for parsed MRCT sections.
_SECTION_LABELS_ORDER = [
    "Glossary Definition",
    "Use in Context",
    "More Info",
    "Other Info to Think About When Joining a Study",
    "Related Terms",
    "Other Resources",
    "Resource URL",
    "Term URL",
    "CDISC/NCI URL"
]

# label alternatives to catch small variations
# Maps canonical (lower-case) label → accepted spelling variants; matching in
# parse_excel_sections is substring-based against the lowered label text.
_LABEL_ALIASES = {
    "glossary definition": ["glossary definition", "definition", "glossarydefinition"],
    "use in context": ["use in context", "use in context:"],
    "more info": ["more info", "more information", "additional info", "additional information"],
    "other info to think about when joining a study": [
        "other info to think about when joining a study",
        "other info to think about when joining the study",
        "other info to think about when joining a study:"
    ],
    "related terms": ["related terms", "related term", "related terms:"],
    "other resources": ["other resources", "other resource"],
    "resource url": ["resource url", "other resources:", "other resources:"],
    "term url": ["term url", "term url:"],
    "cdisc/nci url": ["cdisc/nci url", "cdisc nci url"],
}

# regex to find labels in a single blob of text
# Named group "label" captures the heading; the trailing ':' or '-' plus
# whitespace is consumed so section values start clean.
_LABEL_RE = re.compile(
    r"(?P<label>(Glossary Definition|Definition|Use in Context|More Info|More Information|Other Info to Think About When Joining a Study|Other Info|Related Terms|Related Terms:|Related Term|Other Resources|Resource URL|Term URL|CDISC/NCI URL))\s*[:\-]\s*",
    flags=re.IGNORECASE
)
|
| 201 |
+
|
| 202 |
+
def parse_excel_sections(blob: str):
    """
    Parse a text blob that may contain multiple labeled sections (MRCT style).
    Returns ordered list of (label, text) based on _SECTION_LABELS_ORDER.
    If labels are missing, returns a single ('Glossary Definition', blob).
    """
    if not blob or not isinstance(blob, str):
        return []

    # Normalize common HTML tags if present (some entries might include <br> or <b>)
    b = re.sub(r"<br\s*/?>", "\n", blob, flags=re.IGNORECASE)
    b = re.sub(r"</?[^>]+>", "", b)  # strip tags conservatively

    # Find all label matches and their positions
    matches = list(_LABEL_RE.finditer(b))
    if not matches:
        # no labels found: attempt heuristic splits by known label-like lines
        # split by double newline or "Related Terms:" if present
        if "\n\n" in b:
            parts = [p.strip() for p in b.split("\n\n") if p.strip()]
            # heuristically map first part to glossary definition
            out = [("Glossary Definition", parts[0])]
            # remaining parts appended as "More Info" or "Other Info"
            for i, p in enumerate(parts[1:], start=1):
                label = "More Info" if i == 1 else f"Other Info {i}"
                out.append((label, p))
            return out
        # fallback single block
        return [("Glossary Definition", b.strip())]

    # build spans for label->value: each value runs from the end of its
    # label match to the start of the next label (or end of blob).
    spans = []
    for idx, m in enumerate(matches):
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(b)
        label = m.group("label").strip().rstrip(":").strip()
        val = b[start:end].strip()
        spans.append((label, val))

    # Normalize labels to canonical labels and build ordered dict
    canonical = {}
    for lab, val in spans:
        key = lab.lower().strip().rstrip(":")
        # map through alias set (substring match against the lowered label)
        mapped = None
        for canon, aliases in _LABEL_ALIASES.items():
            for alias in aliases:
                if alias in key:
                    mapped = canon
                    break
            if mapped:
                break
        if not mapped:
            # fallback: title-case the label
            mapped = lab.strip().title()
        # Repeated sections under the same label are concatenated.
        canonical[mapped] = canonical.get(mapped, "") + ("\n\n" + val if canonical.get(mapped) else val)

    # produce ordered list according to preferred order
    # NOTE(review): Title-cased fallback keys can never match the lower-cased
    # order keys here; they only survive via the "if not out" branch below —
    # confirm this is the intended behavior.
    out = []
    for label in _SECTION_LABELS_ORDER:
        key = label.lower()
        if key in canonical:
            out.append((label, canonical[key].strip()))
    # if nothing matched (odd case), return spans as-is (label, val)
    if not out:
        for lab, val in spans:
            out.append((lab.strip(), val.strip()))
    return out
|
| 270 |
+
|
| 271 |
+
# ----------------------------
|
| 272 |
+
# Abbreviations.xlsx loader (Priority B: treated as excel)
|
| 273 |
+
# ----------------------------
|
| 274 |
+
def load_abbreviations_entries(search_paths=None):
    """
    Discover Abbreviations.xlsx and return list of candidate dicts:
    [{'definition','text','file','type','term','sources'}...]
    Auto-detects header names: term/acronym and definition/long name/description.
    Falls back to positional columns (A=term, B=definition).
    """
    if pd is None:
        logger.warning("pandas not installed — skipping Abbreviations.xlsx load.")
        return []

    # common HF cache & persistent paths
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        # Caller-supplied paths are searched before the defaults.
        default_paths = list(search_paths) + default_paths

    # Glob several filename variants under every search root; best-effort,
    # any path error just moves on to the next root.
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*Abbreviations*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/Abbreviations.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbrev*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbreviations*.xls*"), recursive=True))
        except Exception:
            continue
    # De-duplicate while preserving discovery order.
    files = list(dict.fromkeys(files))

    entries = []
    for fx in files:
        try:
            # ".xlsx" (ends with "x") → prefer the openpyxl engine, falling
            # back to pandas' default engine on failure.
            if fx.lower().endswith("x") and pd is not None:
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read Abbreviations file %s: %s", fx, e)
            continue

        cols = {c.strip().lower(): c for c in df.columns}
        # common possible headers for term
        term_candidates = ["abbreviation", "acronym", "term", "short form", "initial", "abbrev", "abbrev."]
        def_candidates = ["definition", "description", "long name", "meaning", "full form", "explanation"]

        # First matching header (substring match) wins for each role.
        term_col = None
        def_col = None
        for k, v in cols.items():
            if any(tc in k for tc in term_candidates) and term_col is None:
                term_col = v
            if any(dc in k for dc in def_candidates) and def_col is None:
                def_col = v

        # fallback to positional columns A/B (0/1)
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or (df.columns[1] if len(df.columns) > 1 else df.columns[0])
            except Exception:
                logger.warning("Abbreviations file %s missing expected term/definition columns. Skipping.", fx)
                continue

        count = 0
        for _, row in df.iterrows():
            term = str(row.get(term_col) or "").strip()
            definition = str(row.get(def_col) or "").strip()
            if not term or not definition:
                continue
            # combine into text for retrieval
            combined = definition
            entries.append({
                "definition": definition,
                "text": combined,
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
            count += 1
        logger.info("Loaded %d abbreviations from %s", count, fx)
    logger.info("Total loaded abbreviations: %d", len(entries))
    return entries
|
| 367 |
+
|
| 368 |
+
# ----------------------------
|
| 369 |
+
# CDISC Excel loader (improved HF cache discovery)
|
| 370 |
+
# ----------------------------
|
| 371 |
+
def load_cdisc_entries(search_paths=None):
    """
    Discover CDISC glossary Excel files and return normalized candidate dicts.

    Each returned entry has the shape:
        {'definition', 'text', 'file', 'type', 'term', 'sources'}

    Search order: any caller-supplied ``search_paths`` first, then a fixed set
    of default locations (cwd, data dirs, the Hugging Face dataset cache).
    Returns an empty list when pandas is unavailable or no files are found.
    """
    if pd is None:
        logger.warning("pandas not installed — skipping CDISC Excel load.")
        return []

    # HF cache path where datasets are downloaded during rebuild
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        default_paths = list(search_paths) + default_paths

    # find files (recursive search)
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*[Cc][Dd][Ii][Ss][Cc]*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/CDISC Glossary.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*CDISC*.xls*"), recursive=True))
        except Exception:
            continue
    files = list(dict.fromkeys(files))  # unique, order-preserving

    entries = []
    for fx in files:
        try:
            # Prefer the openpyxl engine for .xlsx files; fall back to pandas'
            # default engine if that fails.
            # FIX: was `fx.lower().endswith("x") and pd is not None` — the "x"
            # suffix test matched any path merely ending in "x", and the pd
            # check is redundant after the early return above.
            if fx.lower().endswith(".xlsx"):
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read CDISC file %s: %s", fx, e)
            continue

        # Map lowercase header -> original column name for tolerant lookup.
        cols = {c.strip().lower(): c for c in df.columns}
        term_col = cols.get("cdisc submission value") or cols.get("term") or cols.get("submission value")
        syn_col = cols.get("cdisc synonym(s)") or cols.get("cdisc synonym") or cols.get("synonym(s)") or cols.get("synonyms")
        def_col = cols.get("cdisc definition") or cols.get("definition") or cols.get("cdisc definition(s)")

        # fallback to positional columns A/B/C if headers differ
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or df.columns[2]
                syn_col = syn_col or (df.columns[1] if len(df.columns) > 1 else None)
            except Exception:
                # Fewer than 3 columns — cannot locate a definition column.
                logger.warning("CDISC file %s missing expected columns (A or C). Skipping.", fx)
                continue

        for _, row in df.iterrows():
            term = str(row.get(term_col) or "").strip()
            synonyms = str(row.get(syn_col) or "").strip() if syn_col else ""
            definition = str(row.get(def_col) or "").strip()
            if not term or not definition:
                continue
            # Build combined text including synonyms for better retrieval
            text_parts = [definition]
            if synonyms:
                text_parts.append("Synonyms: " + synonyms)
            combined = "\n\n".join([p for p in text_parts if p])
            entries.append({
                "definition": definition,
                "text": combined,
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
    logger.info("Loaded %d CDISC entries from %d files", len(entries), len(files))
    return entries
|
| 457 |
+
|
| 458 |
+
# ----------------------------
|
| 459 |
+
# Clinical-informatics PDF parser using PyMuPDF (fitz)
|
| 460 |
+
# ----------------------------
|
| 461 |
+
def parse_clinical_informatics_pdf(path):
    """
    Parse clinical-informatics-acronym-glossary.pdf with PyMuPDF (fitz).

    Scans blank-line-separated text blocks; a short all-caps/digit/dash
    heading is treated as an acronym and all following blocks up to the next
    acronym heading become its definition. Returns candidate dicts shaped
    like the Excel loaders: {'definition','text','file','type','term','sources'}.
    """
    if fitz is None:
        logger.warning("PyMuPDF (fitz) not installed — skipping clinical-informatics PDF parsing.")
        return []

    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.exception("Failed to open PDF %s: %s", path, e)
        return []

    # Pull plain text from every readable page; skip pages that fail.
    page_texts = []
    for page in doc:
        try:
            extracted = page.get_text("text")
        except Exception:
            continue
        if extracted:
            page_texts.append(extracted)
    doc.close()

    body = "\n".join(page_texts).replace("\r", "")
    # split into blocks by blank lines
    blocks = [b.strip() for b in re.split(r"\n\s*\n", body) if b.strip()]

    # acronym-like heading: all-caps letters, digits, dashes/slashes, short
    acronym_re = re.compile(r"^[A-Z0-9\-/]{1,12}$")

    entries = []
    idx = 0
    total = len(blocks)
    while idx < total:
        heading = blocks[idx].splitlines()[0].strip()
        if not acronym_re.match(heading):
            idx += 1
            continue

        # Collect description blocks until the next acronym-style heading.
        nxt = idx + 1
        desc_blocks = []
        while nxt < total and not acronym_re.match(blocks[nxt].splitlines()[0].strip()):
            desc_blocks.append(blocks[nxt])
            nxt += 1

        if desc_blocks:
            definition = "\n\n".join(desc_blocks).strip()
            entries.append({
                "definition": definition,
                "text": definition,
                "file": os.path.basename(path),
                "type": "pdf",
                "term": heading,
                "sources": [os.path.basename(path)]
            })
        idx = nxt

    logger.info("Parsed %d entries from clinical-informatics PDF (PyMuPDF).", len(entries))
    return entries
|
| 523 |
+
|
| 524 |
+
# ----------------------------
|
| 525 |
+
# MAIN RETRIEVER (preserve name)
|
| 526 |
+
# ----------------------------
|
| 527 |
+
def summarize_combined(query: str, mode: str = "short") -> str:
    """
    Answer a user query by combining FAISS dense retrieval, BM25 keyword
    retrieval, and runtime-loaded glossary sources (Abbreviations/CDISC Excel
    files plus the clinical-informatics PDF), then render one HTML answer
    block per original source.

    Parameters
    ----------
    query : str
        Raw user question.
    mode : str
        Accepted for interface compatibility; not used in this body.
        # NOTE(review): confirm whether any caller still passes mode.

    Returns
    -------
    str
        An HTML fragment with one section per source, or an italicized
        message when no query / no results.
    """
    start = time.time()
    if not query or not query.strip():
        return "<i>No query provided.</i>"

    # Normalize user query
    cleaned = strip_question_phrases(query)
    expanded = normalize_query_text(cleaned)
    van_tokens = extract_van_tokens(expanded)
    normalized = " ".join(van_tokens).strip() or cleaned
    nq = normalized.lower().strip()
    print(f"🔍 summarize_combined() | cleaned='{cleaned}' normalized='{nq}'")

    # Acronym expansion map (preserve/extend)
    acronym_map = {
        "ae": "adverse event", "adr": "adverse drug reaction",
        "crf": "case report form", "ecrf": "electronic case report form",
        "cro": "contract research organization", "csr": "clinical study report",
        "ctms": "clinical trial management system", "edc": "electronic data capture",
        "ehr": "electronic health record", "emr": "electronic medical record",
        "gcp": "good clinical practice", "irb": "institutional review board",
        "iec": "independent ethics committee", "ind": "investigational new drug application",
        "mrct": "multi-regional clinical trials", "qa": "quality assurance",
        "qc": "quality control", "sae": "serious adverse event", "sap": "statistical analysis plan",
        "siv": "site initiation visit", "sop": "standard operating procedure",
        "ssu": "study start-up", "uat": "user acceptance testing",
        "whodrug": "world health organization drug dictionary",
    }

    # If the whole query is a known acronym, search on its expansion instead.
    glossary_key = _normalize_term(nq)
    if glossary_key in acronym_map:
        expanded_term = acronym_map[glossary_key]
        nq = _normalize_term(expanded_term)
        print(f"🔁 Acronym expanded → {expanded_term}")

    # ----------------------------
    # FAISS + BM25 retrieval
    # ----------------------------
    # Both retrieval paths are best-effort: failures are logged and we
    # continue with whatever hits we have.
    dense_hits, bm25_hits = [], []
    try:
        if _ensure_faiss_index():
            dense_hits = search_index(normalized, top_k=DENSE_TOP_K) or []
            print(f"✅ FAISS hits: {len(dense_hits)}")
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")

    try:
        docs = load_all_text_chunks()
        if docs:
            bm25_hits = search_bm25(normalized, docs, top_n=8) or []
            print(f"✅ BM25 hits: {len(bm25_hits)}")
    except Exception as e:
        print(f"⚠️ BM25 fallback failed: {e}")

    # ----------------------------
    # Inject Abbreviations + CDISC + clinical-informatics PDF parsed entries (runtime)
    # ----------------------------
    extra_hits = []
    try:
        abbrev_entries = load_abbreviations_entries()
        for e in abbrev_entries:
            extra_hits.append({
                "definition": e["definition"],
                "text": e["text"],
                "file": e["file"],
                "type": e["type"],
                "term": e["term"],
                "sources": e.get("sources", [])
            })
    except Exception as e:
        logger.exception("Abbreviations load failed: %s", e)

    try:
        cdisc_entries = load_cdisc_entries()
        for e in cdisc_entries:
            extra_hits.append({
                "definition": e["definition"],
                "text": e["text"],
                "file": e["file"],
                "type": e["type"],
                "term": e["term"],
                "sources": e.get("sources", [])
            })
    except Exception as e:
        logger.exception("CDISC load failed: %s", e)

    try:
        pdf_paths = glob.glob("./*clinical*informatics*.pdf") + glob.glob("/mnt/data/*clinical*informatics*.pdf") + glob.glob("/workspace/data/*clinical*informatics*.pdf")
        pdf_paths = list(dict.fromkeys(pdf_paths))
        for p in pdf_paths:
            parsed = parse_clinical_informatics_pdf(p)
            for e in parsed:
                extra_hits.append({
                    "definition": e["definition"],
                    "text": e["text"],
                    "file": e["file"],
                    "type": e["type"],
                    "term": e["term"],
                    "sources": e.get("sources", [])
                })
    except Exception as e:
        logger.exception("clinical-informatics parse failed: %s", e)

    hits = (dense_hits or []) + (bm25_hits or []) + extra_hits
    if not hits:
        return "<i>No relevant information found.</i>"

    # ----------------------------
    # Group by original resolved source (prefer real source over glossary.json)
    # ----------------------------
    grouped = {}
    glossary_fallbacks = []

    for h in hits:
        raw_src = h.get("file") or h.get("source") or h.get("source_file") or "unknown"
        meta_sources = h.get("sources") or h.get("source_list") or []

        # prefer a non-glossary meta source if available
        src = raw_src
        if isinstance(meta_sources, (list, tuple)) and meta_sources:
            chosen = None
            for s in meta_sources:
                if isinstance(s, str) and not s.lower().endswith("glossary.json"):
                    chosen = s
                    break
            if chosen:
                src = chosen
            else:
                src = meta_sources[0]

        src_type = (h.get("type") or "").lower()
        term = (h.get("term") or "").strip()
        term_lower = term.lower()

        txt = (h.get("definition") or h.get("text") or h.get("content") or h.get("full_text") or "").strip()
        if not txt:
            continue

        txt = clean_extracted_text(txt)
        # If original stored file was glossary.json, keep as fallback only
        if str(raw_src).lower().endswith("glossary.json"):
            glossary_fallbacks.append({"hit": h, "text": txt, "src": src})

        # Save resolved sources for provenance. Ensure URL-like sources are preserved.
        resolved_sources = meta_sources if meta_sources else []
        # If resolved_sources empty, try to collect URL-like values from hit fields or src
        if not resolved_sources:
            possible = []
            for key in ("url", "source", "link", "file"):
                v = h.get(key)
                if isinstance(v, str) and v.startswith("http"):
                    possible.append(v)
            if isinstance(src, str) and src.startswith("http"):
                possible.append(src)
            # fallback to raw_src if nothing else
            resolved_sources = possible or [raw_src]
        # normalize to list
        if isinstance(resolved_sources, (list, tuple)):
            resolved_sources = list(resolved_sources)
        else:
            resolved_sources = [resolved_sources]
        h["_resolved_sources"] = resolved_sources

        # For MRCT-like text (detected by filename), dedupe repeated sections first
        if "mrct" in str(src).lower() or "mrct" in str(raw_src).lower():
            txt = dedupe_section_headers(txt)

        # Group key based on resolved original source + type + term
        # Special-case Abbreviations so each term is unique (Priority B behavior)
        src_l = str(src).lower()
        raw_src_l = str(raw_src).lower()
        if "abbreviations.xlsx" in src_l or "abbreviations.xlsx" in raw_src_l or ("abbreviations" in src_l and src_type == "excel"):
            key = f"abbrev__excel__{term_lower}"
        # Special-case CDISC so each term is unique (Option A)
        elif "cdisc glossary.xlsx" in src_l or "cdisc glossary.xlsx" in raw_src_l or ("cdisc" in src_l and src_type == "excel"):
            key = f"cdisc__excel__{term_lower}"
        else:
            key = f"{os.path.basename(src).lower()}__{src_type}__{term_lower[:200]}"

        # Prefer glossary PDF entries (GCDMP/ 'glossary' in filename) when colliding with long chunks
        prefer_glossary = (GCDMP_FILENAME.lower() in str(src).lower()) or ("glossary" in str(src).lower())

        if key not in grouped:
            grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
        else:
            existing_src = grouped[key]["src"]
            existing_is_glossary = (GCDMP_FILENAME.lower() in str(existing_src).lower()) or ("glossary" in str(existing_src).lower())
            if prefer_glossary and not existing_is_glossary:
                grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
            else:
                # otherwise prefer longer chunk unless this new is a glossary and existing is not
                if not prefer_glossary and len(txt) > len(grouped[key]["text"]):
                    grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}

    # ----------------------------
    # Format answers: one per original source
    # ----------------------------
    answers = []
    src_counts = {"excel": 0, "pdf": 0, "web": 0, "other": 0}

    # ensure only one combined excel answer per term to prevent duplicated sections
    seen_excel_terms = set()

    for entry in grouped.values():
        h = entry["hit"]
        txt = entry["text"]
        src = entry["src"]
        src_type = entry.get("src_type") or (h.get("type") or "").lower()
        term = entry.get("term") or (h.get("term") or "").strip()
        term_lower = (term or "").lower()

        # Skip entries resolved to glossary.json here (we'll use them only as fallback)
        if str(src).lower().endswith("glossary.json"):
            continue

        # If this is an excel entry for MRCT/CDISC/Abbrev, ensure only first combined answer per term
        is_excel = (src_type == "excel") or str(src).lower().endswith((".xls", ".xlsx"))
        if is_excel:
            if term_lower in seen_excel_terms:
                # skip duplicate excel results for same term (they will be combined in the first occurrence)
                continue
            # mark as seen (so subsequent excel chunks won't produce duplicates)
            seen_excel_terms.add(term_lower)

        # Skip noisy PDF sections unless they look like short glossary entries
        txt_lower = txt.lower()
        if src_type == "pdf" and any(k in txt_lower[:300] for k in ["table of contents", "appendix", "index", "section"]):
            if not (len(txt.split()) < 80 and term_lower and term_lower in txt_lower[:120]):
                # treat as noise
                continue

        # Determine icon and counts
        if src_type == "excel":
            icon, cat = "📘", "excel"
        elif src_type == "pdf":
            icon, cat = "📄", "pdf"
        elif src_type == "web":
            icon, cat = "🌐", "web"
        else:
            icon, cat = "📁", "other"
        src_counts[cat] += 1

        # Extract excerpt (PDF / web special handling for glossary-style)
        excerpt = ""
        if src_type in ("pdf", "web"):
            paragraphs = re.split(r"\n\s*\n", txt)
            paragraphs = [p.strip() for p in paragraphs if p.strip()]

            # 1) If full term matches heading (e.g., "electronic health record")
            if paragraphs and term_lower:
                heading = paragraphs[0].strip().lower()
                if heading == term_lower or (term_lower in heading):
                    excerpt = paragraphs[1].strip() if len(paragraphs) > 1 else paragraphs[0].strip()

            # 2) If not yet found, try exact normalized query inside paragraphs
            if not excerpt:
                found = None
                for p in paragraphs:
                    if nq and nq in p.lower():
                        found = p.strip()
                        break

                # 3) Fuzzy match with paragraph starts
                if not found and term_lower:
                    for p in paragraphs:
                        if fuzzy_ratio(term_lower, p.lower()[:100]) > 0.75:
                            found = p.strip()
                            break

                # 4) Paragraph following a heading that contains the term
                if not found and term_lower:
                    for i, p in enumerate(paragraphs[:-1]):
                        if term_lower in p.lower():
                            found = paragraphs[i + 1].strip()
                            break

                excerpt = (found or (paragraphs[0] if paragraphs else txt)).strip()

            excerpt = excerpt[:2000] + ("..." if len(excerpt) > 2000 else "")
            excerpt = add_links_to_text(excerpt)

        elif src_type == "excel":
            # Special-case: Abbreviations -> always show full clean definition (single block)
            if "abbreviations.xlsx" in str(src).lower() or ("abbreviations" in str(src).lower() and src_type=="excel"):
                excerpt = add_links_to_text(txt)
            # Special-case: CDISC -> always show full clean definition (single block)
            elif "cdisc glossary.xlsx" in str(src).lower() or ("cdisc" in str(src).lower() and src_type=="excel"):
                excerpt = add_links_to_text(txt)
            else:
                # General Excel/MRCT parsing: parse labeled sections and build one combined excerpt
                try:
                    sections = parse_excel_sections(txt)
                except Exception:
                    sections = [("Glossary Definition", txt)]

                lines = []
                seen_vals = set()
                for label, val in sections:
                    if not val or not str(val).strip():
                        continue
                    v = str(val).strip()
                    # Clickify URLs if the section is a single URL
                    if re.match(r"^https?://\S+$", v):
                        v_html = f'<a href="{v}" target="_blank" rel="noopener noreferrer">{v}</a>'
                    else:
                        v_html = add_links_to_text(v)
                    # Avoid duplicate repeated text segments
                    if v_html in seen_vals:
                        continue
                    seen_vals.add(v_html)
                    lines.append(f"<b>{label}:</b> {v_html}")

                excerpt = "<br>".join(lines) if lines else add_links_to_text(txt)

        else:
            excerpt = add_links_to_text(txt)

        # Prepare heading and display sources (exclude internal glossary.json from display)
        heading_term = term.strip() or os.path.splitext(os.path.basename(src))[0]
        heading_html = f"<h4>{icon} {heading_term}</h4>"

        # Use _resolved_sources (preserved earlier) and ensure web URLs are shown directly
        # Build clickable sources
        resolved_sources = h.get("_resolved_sources") or []
        display_sources = []

        for s in resolved_sources:
            if not isinstance(s, str):
                continue
            if s.lower().endswith("glossary.json"):
                continue

            if s.startswith("http"):
                display_sources.append(
                    f'<a href="{s}" target="_blank" rel="noopener noreferrer">{s}</a>'
                )
            else:
                display_sources.append(os.path.basename(s))

        # Fallback if empty
        if not display_sources:
            if isinstance(src, str) and src.startswith("http"):
                display_sources = [
                    f'<a href="{src}" target="_blank" rel="noopener noreferrer">{src}</a>'
                ]
            else:
                display_sources = [os.path.basename(str(src))]

        # ALWAYS create sources_line safely
        sources_line = (
            "<p>🔗 <i>Source:</i> "
            + " · ".join(dict.fromkeys(display_sources))
            + "</p>"
        )

        answers.append({
            "rank": get_source_rank(src, src_type),
            "type": cat,
            "term": term,
            "html": f"{heading_html}{sources_line}<blockquote>{excerpt}</blockquote>"
        })

    # ----------------------------
    # Fallback: only use glossary.json definitions if no other original sources matched
    # ----------------------------
    if not answers and glossary_fallbacks:
        for item in glossary_fallbacks:
            h = item["hit"]
            txt = item["text"]
            src = item.get("src") or (h.get("file") or h.get("source") or "glossary.json")
            term = (h.get("term") or "").strip() or "Definition"
            heading_html = f"<h4>📄 {term}</h4>"
            excerpt = txt.strip()
            answers.append({
                "rank": 10,
                "type": "pdf",
                "term": term,
                "html": f"{heading_html}<p>🔗 <i>Source:</i> {os.path.basename(src)}</p><blockquote>{excerpt}</blockquote>"
            })

    # ----------------------------
    # Final sort & output
    # ----------------------------
    if not answers:
        return "<i>No relevant results found.</i>"

    answers = sorted(answers, key=lambda a: a["rank"])
    final_html_parts = [a["html"] for a in answers[:TOP_RESULTS_LIMIT]]

    summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)

    elapsed = time.time() - start
    print(f"✅ Answers from {len(answers)} sources in {elapsed:.2f}s")

    return (
        f"<h3>🧠 Answers (one per source):</h3>"
        f"<p><i>Sources → {summary_counts}</i></p>"
        + "<br>".join(final_html_parts)
    )
|
core/retrieval.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
from whoosh.index import open_dir
|
| 5 |
+
from whoosh.qparser import MultifieldParser
|
| 6 |
+
|
| 7 |
+
WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"
|
| 8 |
+
|
| 9 |
+
_ix = None
|
| 10 |
+
|
| 11 |
+
def _load_whoosh():
    """Open the Whoosh index once and memoize it; returns the cached index
    (or None when the index directory does not exist)."""
    global _ix
    if _ix is not None:
        return _ix
    if os.path.exists(WHOOSH_INDEX_PATH):
        _ix = open_dir(WHOOSH_INDEX_PATH)
    return _ix
|
| 16 |
+
|
| 17 |
+
def _bm25_search(query, top_n=10):
    """BM25 keyword search over the Whoosh index.

    Returns up to ``top_n`` dicts of {'text', 'file'}; an empty list when
    the index is unavailable.
    """
    ix = _load_whoosh()
    if not ix:
        return []
    parsed = MultifieldParser(["text", "title"], schema=ix.schema).parse(query)
    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=top_n)
        # Materialize inside the `with` — Hit objects are tied to the searcher.
        return [{"text": hit["text"], "file": hit.get("file", "")} for hit in hits]
|
core/van_normalizer.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# core/van_normalizer.py
|
| 2 |
+
import re
|
| 3 |
+
import nltk
|
| 4 |
+
from nltk import pos_tag, word_tokenize
|
| 5 |
+
from nltk.stem import WordNetLemmatizer
|
| 6 |
+
|
| 7 |
+
# make sure you have these (run once if missing):
|
| 8 |
+
# python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4
|
| 9 |
+
|
| 10 |
+
lemmatizer = WordNetLemmatizer()
|
| 11 |
+
|
| 12 |
+
def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for the clinical-trial domain).

    Lowercases and strips punctuation, tokenizes and POS-tags, keeps only
    verbs/adjectives/nouns while dropping articles and auxiliary "be" forms,
    lemmatizes each surviving token, and returns a space-joined string
    suitable for FAISS embedding.
    """
    if not text:
        return ""

    # Basic cleanup: lowercase, keep only alphanumerics/whitespace/hyphens.
    cleaned = re.sub(r"[^a-z0-9\s-]", " ", text.lower().strip())
    tagged = pos_tag(word_tokenize(cleaned))

    # Determiners/articles and auxiliary "be" verbs carry no retrieval signal.
    skip_words = {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}

    lemmas = []
    for token, tag in tagged:
        if token in skip_words:
            continue
        # Keep only Verb (V*), Adjective (J*), and Noun (N*) tags.
        if not (tag.startswith("V") or tag.startswith("J") or tag.startswith("N")):
            continue
        # Map the Penn Treebank tag to the WordNet POS for lemmatization.
        wn_pos = "v" if tag.startswith("V") else ("a" if tag.startswith("J") else "n")
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))

    # Join and collapse any residual whitespace runs.
    return re.sub(r"\s+", " ", " ".join(lemmas).strip())
|
| 57 |
+
|
core/vector_search.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_search.py
|
| 3 |
+
|
| 4 |
+
Thin wrapper helpers used to orchestrate searches and resets from the app or admin UI.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
from core import vector_store, vector_sync
|
| 9 |
+
|
| 10 |
+
def semantic_search(query: str, top_k: int = 6) -> List[Dict[str, Any]]:
    """
    Run a FAISS semantic search through ``vector_store``.

    Any failure (missing index, model errors, etc.) is logged to stdout and
    an empty list is returned instead of raising.
    """
    try:
        hits = vector_store.search_index(query, top_k=top_k)
    except Exception as e:
        print(f"⚠️ semantic_search error: {e}")
        return []
    return hits
|
| 19 |
+
|
| 20 |
+
def reset_faiss_and_rebuild(glossary_builder_fn=None, rebuild_index_fn=None) -> str:
    """
    Clear local FAISS caches, then optionally rebuild glossary and index.

    Parameters
    ----------
    glossary_builder_fn : callable, optional
        Zero-arg function that rebuilds the glossary; its string result is
        appended to the report.
    rebuild_index_fn : callable, optional
        Zero-arg function that triggers a full index rebuild.

    Returns
    -------
    str
        Human-readable status report; failures are embedded in the report
        rather than raised.
    """
    try:
        vector_store.clear_local_faiss()
    except Exception as e:
        print(f"⚠️ clear_local_faiss failed: {e}")

    parts = ["🧹 Cleared local FAISS files.\n"]

    # Glossary rebuild is best-effort.
    if glossary_builder_fn:
        try:
            parts.append(glossary_builder_fn() + "\n")
        except Exception as e:
            parts.append(f"⚠️ Glossary builder failed: {e}\n")

    # Full index rebuild is best-effort as well.
    if rebuild_index_fn:
        try:
            parts.append(rebuild_index_fn())
        except Exception as e:
            parts.append(f"⚠️ Rebuild index failed: {e}\n")

    return "".join(parts)
|
| 47 |
+
|
core/vector_store.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_store.py
|
| 3 |
+
-----------------------------------------------------
|
| 4 |
+
Maintains FAISS runtime index + metadata cache.
|
| 5 |
+
|
| 6 |
+
Features
|
| 7 |
+
--------
|
| 8 |
+
- Ensure local FAISS runtime index exists (download from HF if missing)
|
| 9 |
+
- FAISS semantic search and BM25 text access
|
| 10 |
+
- Automatic TTL reload
|
| 11 |
+
- Full cache clearing for Hugging Face Space
|
| 12 |
+
- Explicit "♻️ FAISS memory cache reset" logging on rebuild
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import json
|
| 17 |
+
import time
|
| 18 |
+
import shutil
|
| 19 |
+
from typing import List, Dict, Any, Optional
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import faiss
|
| 23 |
+
from sentence_transformers import SentenceTransformer
|
| 24 |
+
from huggingface_hub import hf_hub_download
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ------------------------------------------------------------------
# 🔧 Paths & constants
# ------------------------------------------------------------------
PERSISTENT_DIR = "/home/user/app/persistent"   # long-lived copy of the index artifacts
RUNTIME_DIR = "/home/user/app/runtime_faiss"   # working copy actually loaded for search
INDEX_NAME = "faiss.index"                     # FAISS index file name
META_NAME = "faiss.index.meta.json"            # JSON list of per-vector metadata
GLOSSARY_META = "glossary.json"                # glossary cache, cleared alongside the index
HF_INDEX_REPO = "essprasad/CT-Chat-Index"      # HF dataset repo the artifacts are synced with

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBED_MODEL = None  # lazy loaded

# in-memory cache (module-level singletons, refreshed when the TTL lapses)
_runtime_index: Optional[faiss.Index] = None          # loaded FAISS index, or None
_runtime_meta: Optional[List[Dict[str, Any]]] = None  # metadata aligned with the vectors
_meta_loaded_time = 0.0        # time.time() of the last successful load
_META_TTL_SECONDS = 300.0      # reload metadata after 5 minutes
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ------------------------------------------------------------------
|
| 48 |
+
# 🔹 Helpers
|
| 49 |
+
# ------------------------------------------------------------------
|
| 50 |
+
def _ensure_dirs():
    """Create the persistent and runtime directories if they are missing."""
    for directory in (PERSISTENT_DIR, RUNTIME_DIR):
        os.makedirs(directory, exist_ok=True)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _ensure_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The model is cached in the module-level ``EMBED_MODEL`` so the
    expensive load happens at most once per process.
    """
    global EMBED_MODEL
    if EMBED_MODEL is not None:
        return EMBED_MODEL
    print("📥 Loading embedding model for FAISS retrieval…")
    EMBED_MODEL = SentenceTransformer(EMBED_MODEL_NAME)
    print("✅ Embedding model loaded.")
    return EMBED_MODEL
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ------------------------------------------------------------------
|
| 65 |
+
# 🔹 Cache control
|
| 66 |
+
# ------------------------------------------------------------------
|
| 67 |
+
def clear_local_faiss():
    """Delete all local FAISS + glossary caches (safe in HF Space).

    Removes the persistent index/meta/glossary files and the whole
    runtime directory; failures on individual paths are logged but do
    not stop the remaining deletions.
    """
    targets = [
        os.path.join(PERSISTENT_DIR, INDEX_NAME),
        os.path.join(PERSISTENT_DIR, META_NAME),
        os.path.join(PERSISTENT_DIR, GLOSSARY_META),
        RUNTIME_DIR,
    ]
    for path in targets:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            elif os.path.exists(path):
                os.remove(path)
            # printed even when the path was already absent (best-effort log)
            print(f"🗑️ Cleared: {path}")
        except Exception as err:
            print(f"⚠️ Failed to clear {path}: {err}")
    print("♻️ FAISS memory cache reset (runtime + persistent cleared)")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ------------------------------------------------------------------
|
| 87 |
+
# 🔹 Loaders
|
| 88 |
+
# ------------------------------------------------------------------
|
| 89 |
+
def _load_local_index() -> bool:
    """Copy persistent FAISS artifacts into runtime and load them into memory.

    Returns True when both the index and metadata were loaded; on any
    failure the in-memory cache is reset to None and False is returned.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()
    src_index = os.path.join(PERSISTENT_DIR, INDEX_NAME)
    src_meta = os.path.join(PERSISTENT_DIR, META_NAME)
    try:
        if not (os.path.exists(src_index) and os.path.exists(src_meta)):
            return False
        os.makedirs(RUNTIME_DIR, exist_ok=True)
        dst_index = os.path.join(RUNTIME_DIR, INDEX_NAME)
        dst_meta = os.path.join(RUNTIME_DIR, META_NAME)
        shutil.copy2(src_index, dst_index)
        shutil.copy2(src_meta, dst_meta)
        _runtime_index = faiss.read_index(dst_index)
        with open(dst_meta, "r", encoding="utf-8") as fh:
            _runtime_meta = json.load(fh)
        _meta_loaded_time = time.time()
        print(f"✅ Loaded FAISS index ({len(_runtime_meta)} vectors).")
        return True
    except Exception as e:
        print(f"⚠️ Could not load local FAISS index: {e}")
        _runtime_index = None
        _runtime_meta = None
        return False
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _download_index_from_hub() -> bool:
    """Download FAISS artifacts from the Hugging Face dataset repo.

    Fetches ``persistent/faiss.index`` and its metadata file, stores
    both under PERSISTENT_DIR, then loads them. Returns True only when
    the download AND the subsequent local load both succeed.
    """
    _ensure_dirs()
    try:
        print("☁️ Downloading FAISS artifacts from HF dataset…")
        downloaded = {}
        for filename in (INDEX_NAME, META_NAME):
            downloaded[filename] = hf_hub_download(
                repo_id=HF_INDEX_REPO,
                filename=f"persistent/{filename}",
                repo_type="dataset",
            )
        for filename, local_path in downloaded.items():
            shutil.copy2(local_path, os.path.join(PERSISTENT_DIR, filename))
        print("✅ FAISS artifacts downloaded and stored persistently.")
        return _load_local_index()
    except Exception as e:
        print(f"⚠️ HF download failed: {e}")
        return False
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _ensure_faiss_index(force_refresh: bool = False) -> bool:
    """Make sure a runtime FAISS index is available in memory.

    Resolution order: in-memory cache (within TTL) → persistent local
    copy → download from the HF dataset. With ``force_refresh=True``
    the runtime directory and in-memory cache are dropped first, so a
    fresh copy is always loaded.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()

    if force_refresh:
        try:
            shutil.rmtree(RUNTIME_DIR, ignore_errors=True)
            _runtime_index = None
            _runtime_meta = None
            print("♻️ Forced FAISS runtime reload requested.")
        except Exception as e:
            print(f"⚠️ Force refresh failed: {e}")

    cache_is_fresh = (
        _runtime_index is not None
        and (time.time() - _meta_loaded_time) < _META_TTL_SECONDS
    )
    if cache_is_fresh:
        return True

    # short-circuit: only hit the Hub when the local load fails
    if _load_local_index() or _download_index_from_hub():
        return True

    print("⚠️ No FAISS index found locally or remotely.")
    return False
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ------------------------------------------------------------------
|
| 164 |
+
# 🔹 Accessors
|
| 165 |
+
# ------------------------------------------------------------------
|
| 166 |
+
def load_all_text_chunks() -> List[Dict[str, Any]]:
    """Return the metadata list for BM25 fallback or analysis.

    Loads the index on first use; after the TTL lapses the metadata
    file is re-read from the runtime directory (best effort — on
    failure the previously cached copy keeps being served).
    """
    global _runtime_meta, _meta_loaded_time
    if _runtime_meta is None and not _ensure_faiss_index():
        return []
    stale = (time.time() - _meta_loaded_time) > _META_TTL_SECONDS
    if stale:
        try:
            meta_file = os.path.join(RUNTIME_DIR, META_NAME)
            with open(meta_file, "r", encoding="utf-8") as fh:
                _runtime_meta = json.load(fh)
            _meta_loaded_time = time.time()
        except Exception:
            pass  # keep the cached copy if the reload fails
    return _runtime_meta or []
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ------------------------------------------------------------------
|
| 184 |
+
# 🔹 Core Search
|
| 185 |
+
# ------------------------------------------------------------------
|
| 186 |
+
def search_index(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic FAISS search and return metadata hits.

    The query is embedded, L2-normalized, and matched against the
    runtime index. Each hit is a copy of its metadata row with
    ``score``/``file``/``text`` fields filled in. Returns [] when the
    index is unavailable or the search fails.
    """
    if not _ensure_faiss_index():
        return []

    try:
        embedder = _ensure_model()
        query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(query_vec)
        scores, positions = _runtime_index.search(query_vec, top_k)

        hits: List[Dict[str, Any]] = []
        for score, pos in zip(scores[0], positions[0]):
            # FAISS pads missing results with -1; also guard stale indices
            if not (0 <= pos < len(_runtime_meta)):
                continue
            hit = dict(_runtime_meta[pos])
            hit["score"] = float(score)
            hit["file"] = hit.get("file") or hit.get("source") or "unknown"
            hit["text"] = hit.get("text") or hit.get("definition", "")
            hits.append(hit)
        return hits
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")
        return []
|
core/vector_sync.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_sync.py
|
| 3 |
+
Responsibilities:
|
| 4 |
+
- rebuild_faiss_from_glossary(glossary_path) -> builds a new faiss.Index + meta list
|
| 5 |
+
- _upload_to_dataset(index_path, meta_path, repo_id) -> upload via huggingface_hub
|
| 6 |
+
- safe helpers for creating normalized metadata entries
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import json
|
| 12 |
+
import shutil
|
| 13 |
+
from typing import Tuple, List, Dict, Any
|
| 14 |
+
|
| 15 |
+
import faiss
|
| 16 |
+
import numpy as np
|
| 17 |
+
from sentence_transformers import SentenceTransformer
|
| 18 |
+
from huggingface_hub import upload_file
|
| 19 |
+
|
| 20 |
+
# default embedder (same model used elsewhere)
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# directories (created eagerly at import time so callers can assume they exist)
PERSISTENT_DIR = "/home/user/app/persistent"  # where index/meta artifacts are kept
TMP_DIR = "/home/user/app/tmp"                # scratch space for builds
os.makedirs(PERSISTENT_DIR, exist_ok=True)
os.makedirs(TMP_DIR, exist_ok=True)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _ensure_model():
    """Return a process-wide SentenceTransformer, created once and reused.

    The previous implementation promised a "global" model in its
    docstring but constructed a fresh SentenceTransformer on every
    call; the instance is now cached on the function so the expensive
    model load happens at most once. Interface is unchanged.
    """
    model = getattr(_ensure_model, "_cached", None)
    if model is None:
        model = SentenceTransformer(EMBED_MODEL_NAME)
        _ensure_model._cached = model
    return model
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _normalize_meta_row(row: Dict[str, Any]) -> Dict[str, Any]:
|
| 36 |
+
"""Ensure consistent meta record fields."""
|
| 37 |
+
out = {
|
| 38 |
+
"term": row.get("term") or row.get("Term") or row.get("name") or "",
|
| 39 |
+
"text": row.get("text") or row.get("definition") or row.get("content") or "",
|
| 40 |
+
# keep both 'file' (local/basename) and full 'sources' list
|
| 41 |
+
"file": row.get("file") or row.get("source") or "",
|
| 42 |
+
"type": row.get("type") or "",
|
| 43 |
+
"sources": row.get("sources") if isinstance(row.get("sources"), list) else [row.get("source")] if row.get("source") else []
|
| 44 |
+
}
|
| 45 |
+
return out
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ==========================================================
|
| 49 |
+
# 🧠 Main Function: Rebuild FAISS from glossary.json
|
| 50 |
+
# ==========================================================
|
| 51 |
+
def rebuild_faiss_from_glossary(glossary_path: str):
    """
    Build FAISS index + metadata from a glossary JSON file.

    Handles mixed entries (PDF, Excel, Web, Other) and is resilient to
    malformed or oversized rows: bad rows are collected and reported
    instead of aborting the rebuild.

    Parameters
    ----------
    glossary_path : str
        Path to the glossary JSON — either a list of row dicts or a
        dict whose values are row dicts.

    Returns
    -------
    tuple
        ``(faiss.IndexFlatIP, list[dict])`` — inner-product index over
        L2-normalized embeddings and the metadata list aligned 1:1
        with the vectors.

    Raises
    ------
    FileNotFoundError
        If ``glossary_path`` does not exist.
    RuntimeError
        If the JSON cannot be parsed, or no valid entries remain.
    ValueError
        If the parsed JSON is neither a list nor a dict.
    """
    print(f"🧩 Building FAISS from glossary: {glossary_path}")
    if not os.path.exists(glossary_path):
        raise FileNotFoundError(f"Glossary not found: {glossary_path}")

    # --- Load JSON safely
    with open(glossary_path, "r", encoding="utf-8") as f:
        try:
            glossary_data = json.load(f)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load glossary JSON: {e}")

    # Normalize structure
    if isinstance(glossary_data, dict):
        glossary_items = list(glossary_data.values())
    elif isinstance(glossary_data, list):
        glossary_items = glossary_data
    else:
        raise ValueError("Invalid glossary format — must be list or dict.")

    entries, metas, bad_entries, long_entries = [], [], [], []

    # helper: normalized type inference
    def infer_type_from_source(src: str, declared_type: str = "") -> str:
        """Classify a row as pdf/excel/web/other from its source string."""
        src_l = (src or "").lower()
        declared = (declared_type or "").lower()
        if src_l.endswith(".pdf") or "pdf" in declared:
            return "pdf"
        if src_l.endswith((".xlsx", ".xls")) or "excel" in declared or "xls" in src_l:
            return "excel"
        if src_l.startswith("http") or declared == "web" or "http" in src_l:
            return "web"
        return "other"

    # --- Process glossary items
    for i, item in enumerate(glossary_items):
        try:
            if not isinstance(item, dict):
                bad_entries.append(item)
                continue

            term = str(item.get("term") or item.get("Term") or item.get("name") or "").strip()
            definition = str(item.get("definition") or item.get("text") or item.get("content") or "").strip()

            # Normalize sources (keep list)
            src_field = item.get("sources") or item.get("source") or item.get("file") or ""
            if isinstance(src_field, list):
                src_list = [str(s).strip() for s in src_field if s]
                src = ", ".join(src_list)
            else:
                src_list = [str(src_field).strip()] if src_field else []
                src = str(src_field).strip()

            declared_type = str(item.get("type") or "").strip().lower()
            entry_type = infer_type_from_source(src, declared_type)

            # Clean up noisy HTML tags and whitespace
            definition_clean = re.sub(r"<[^>]*>", "", definition)
            definition_clean = re.sub(r"\s+", " ", definition_clean).strip()

            # Skip if missing essentials
            if not term or not definition_clean:
                bad_entries.append(item)
                continue

            # Skip extremely long definitions (likely raw HTML or large web content)
            if len(definition_clean) > 3000:
                long_entries.append({
                    "term": term,
                    "len": len(definition_clean),
                    "source": src
                })
                continue

            text = f"Definition of {term}: {definition_clean}"

            entries.append(text)
            metas.append({
                "term": term,
                "definition": definition_clean,
                # preserve the original source list and file name
                "sources": src_list if src_list else [src] if src else [],
                "source": src,
                "type": entry_type,
                "file": os.path.basename(glossary_path)
            })

        except Exception as e:
            bad_entries.append({
                "index": i,
                "error": str(e),
                "raw": str(item)[:300]
            })
            continue

    # --- Diagnostics
    pdf_count = sum(1 for m in metas if m["type"].lower() == "pdf")
    excel_count = sum(1 for m in metas if m["type"].lower() == "excel")
    web_count = sum(1 for m in metas if m["type"].lower() == "web")
    other_count = len(metas) - (pdf_count + excel_count + web_count)

    print(f"🧠 Encoding {len(entries)} entries (PDF={pdf_count}, Excel={excel_count}, Web={web_count}, Other={other_count})…")

    if bad_entries:
        print(f"⚠️ {len(bad_entries)} malformed entries skipped.")
        for b in bad_entries[:3]:
            print(" →", json.dumps(b, ensure_ascii=False)[:300])

    if long_entries:
        print(f"⚠️ {len(long_entries)} very long entries (>3000 chars) skipped.")
        for l in long_entries[:3]:
            print(f" → Skipped {l['term']} ({l['len']} chars) from {l['source']}")

    if not entries:
        raise RuntimeError("❌ No valid glossary entries found after cleanup!")

    # --- Encoding
    # The embedder is instantiated only now, after the glossary has been
    # validated — previously it was created up front, so a missing or
    # malformed glossary still paid the (slow) model-load cost.
    model = SentenceTransformer(EMBED_MODEL_NAME)
    embeddings = model.encode(entries, show_progress_bar=True, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    print(f"✅ Glossary vectors built ({len(entries)} total entries).")

    # metas is list of dicts aligned with vectors — return exactly as before
    return index, metas
|
| 183 |
+
|
| 184 |
+
# ==========================================================
|
| 185 |
+
# ☁️ Upload Helper
|
| 186 |
+
# ==========================================================
|
| 187 |
+
def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str) -> None:
    """
    Upload FAISS index and metadata JSON to a Hugging Face dataset.

    Both files land under ``persistent/`` in the target repo. Any
    failure is logged and re-raised so callers can react.
    """
    try:
        print(f"☁️ Uploading {index_path} and {meta_path} to {repo_id}...")
        for local_path in (index_path, meta_path):
            upload_file(
                path_or_fileobj=local_path,
                path_in_repo=f"persistent/{os.path.basename(local_path)}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        print("✅ Upload complete.")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}")
        raise
|
core/web_loader.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests, re, json, time, os
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs.

    URLs are read from ``urls_file`` (blank lines and ``#`` comments
    ignored). Pages already in the JSON cache are reused unless
    ``force_refresh`` is set. Returns the full list of cached entries.
    """
    # Start from the on-disk cache unless a refresh was requested.
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as fh:
                cache = json.load(fh)
        except Exception:
            cache = {}

    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as fh:
        urls = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

    fetched = {}
    for position, url in enumerate(urls[: max_pages * 10]):
        if url in cache and not force_refresh:
            fetched[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({position+1}/{len(urls)}): {url}")
            response = requests.get(url, timeout=timeout, headers={"User-Agent": "ClinicalTrialChatBot/1.0"})
            if response.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {response.status_code}")
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # strip boilerplate/navigation before extracting text
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())
            if len(text) < 400:
                continue  # too little content to be useful
            entry_text = f"Source URL: {url}. {text[:3000]}"
            fetched[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)  # be polite to the target hosts
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    cache.update(fetched)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as fh:
        json.dump(cache, fh, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
|