essprasad commited on
Commit
cdea1c3
·
verified ·
1 Parent(s): 7a82071

Upload 11 files

Browse files
core/admin_tasks.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core/admin_tasks.py
3
+
4
+ Centralized admin / maintenance functions used by both the Gradio UI (app.py)
5
+ and the FastAPI admin endpoints (api.py). These are synchronous as in your
6
+ current setup and return friendly status strings for display.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import shutil
12
+ import glob
13
+ import traceback
14
+
15
+ try:
16
+ import pandas as pd
17
+ except Exception:
18
+ pd = None
19
+
20
+ try:
21
+ import faiss
22
+ except Exception:
23
+ faiss = None
24
+
25
+ try:
26
+ from sentence_transformers import SentenceTransformer
27
+ except Exception:
28
+ SentenceTransformer = None
29
+
30
+ from huggingface_hub import hf_hub_download, list_repo_files
31
+
32
+ # functions from your project (should exist)
33
+ # rebuild_faiss_from_glossary should return (index, metas) like before.
34
+ try:
35
+ from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
36
+ except Exception:
37
+ rebuild_faiss_from_glossary = None
38
+ _upload_to_dataset = None
39
+
40
+ # Optional web loader
41
+ try:
42
+ from core.web_loader import web_crawler_loader
43
+ except Exception:
44
+ web_crawler_loader = None
45
+
46
+ PERSISTENT_DIR = "/home/user/app/persistent"
47
+ DATASET_INDEX_REPO = os.environ.get("DATASET_INDEX_REPO", "essprasad/CT-Chat-Index")
48
+ DATASET_DOCS_REPO = os.environ.get("DATASET_DOCS_REPO", "essprasad/CT-Chat-Docs")
49
+
50
def _ensure_dirs():
    """Make sure the persistent storage directory exists (idempotent)."""
    if not os.path.isdir(PERSISTENT_DIR):
        os.makedirs(PERSISTENT_DIR, exist_ok=True)
52
+
53
def clear_index():
    """Delete local FAISS and related caches. Returns a message string."""
    # Files live under the persistent dir; the two folders are app-level caches.
    targets = [
        os.path.join(PERSISTENT_DIR, "faiss.index"),
        os.path.join(PERSISTENT_DIR, "faiss.index.meta.json"),
        os.path.join(PERSISTENT_DIR, "glossary.json"),
        "/home/user/app/data/docs_cache",
        "/home/user/app/runtime_faiss",
    ]
    messages = []
    for target in targets:
        try:
            if os.path.isdir(target):
                shutil.rmtree(target, ignore_errors=True)
                messages.append(f"🗑️ Deleted folder: {target}")
            elif os.path.exists(target):
                os.remove(target)
                messages.append(f"🗑️ Deleted file: {target}")
        except Exception as e:
            # Record the failure but keep deleting the remaining targets.
            messages.append(f"⚠️ Failed to delete {target}: {e}")
    return "\n".join(messages) if messages else "ℹ️ No cache files found."
76
+
77
+
78
def rebuild_glossary():
    """Rebuild the glossary via core.glossary_builder.rebuild_and_upload.

    Returns a friendly status string; never raises.
    """
    # Import lazily so a broken builder module degrades to a status message.
    try:
        from core.glossary_builder import rebuild_and_upload
    except Exception as e:
        return f"⚠️ Cannot import glossary builder: {e}"
    try:
        rebuild_and_upload()
    except Exception as e:
        tb = traceback.format_exc()
        return f"⚠️ Glossary rebuild failed: {e}\n{tb}"
    return "✅ Glossary rebuilt and uploaded successfully."
93
+
94
+
95
def rebuild_index(force_download_glossary: bool = False):
    """
    Rebuild FAISS index from glossary.json + Excel + (optionally) web content.
    Returns status string. Mirrors the logic in your previous rebuild_index implementation.

    Args:
        force_download_glossary: when True, re-download glossary.json from the
            HF index dataset even if a local copy already exists.

    Returns:
        A human-readable status string (success with entry count, or a
        "⚠️ ..." failure message). Never raises.
    """
    _ensure_dirs()
    try:
        # The helper is imported at module level and may be None when
        # core.vector_sync failed to import.
        if rebuild_faiss_from_glossary is None:
            return "⚠️ rebuild_faiss_from_glossary is not available in core.vector_sync."

        glossary_path = os.path.join(PERSISTENT_DIR, "glossary.json")
        # Attempt to download glossary.json from HF dataset if missing
        if not os.path.exists(glossary_path) or force_download_glossary:
            try:
                downloaded = hf_hub_download(repo_id=DATASET_INDEX_REPO, filename="persistent/glossary.json", repo_type="dataset")
                shutil.copy2(downloaded, glossary_path)
            except Exception as e:
                # Continue even if glossary download fails; rebuild_faiss_from_glossary may handle absent file
                # NOTE(review): the comment above contradicts the code — a
                # failed download actually aborts the rebuild here. Confirm
                # which behavior is intended.
                return f"⚠️ Could not download glossary.json from {DATASET_INDEX_REPO}: {e}"

        # Build faiss index using the project helper; returns (index, metas).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        loaded = len(metas) if isinstance(metas, (list, tuple)) else 0

        # Index Excel files from docs dataset (best-effort; failure → no Excel pass).
        try:
            repo_files = list_repo_files(DATASET_DOCS_REPO, repo_type="dataset")
            excel_files = [f for f in repo_files if f.lower().endswith((".xls", ".xlsx"))]
        except Exception:
            excel_files = []

        # If we have SentenceTransformer available we will embed and add Excel content
        if SentenceTransformer is not None and faiss is not None and excel_files:
            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            excel_entries = []
            for file_name in excel_files:
                try:
                    fp = hf_hub_download(repo_id=DATASET_DOCS_REPO, filename=file_name, repo_type="dataset")
                    # read sheets and look for MRCT-style columns (best-effort)
                    try:
                        xls = pd.read_excel(fp, sheet_name=None)
                    except Exception:
                        xls = {}
                    for sheet, df in xls.items():
                        if not isinstance(df, pd.DataFrame):
                            continue
                        cols = [c.lower() for c in df.columns.astype(str)]
                        # heuristic — look for "glossary term" or "term" header
                        if not any("glossary term" in c or "term" == c.strip().lower() for c in cols):
                            continue
                        df = df.fillna("").dropna(how="all")
                        for _, row in df.iterrows():
                            term = str(row.get("Glossary Term", "") or row.get("term", "")).strip()
                            if not term:
                                # try first column
                                try:
                                    term = str(row.iloc[0]).strip()
                                except Exception:
                                    term = ""
                            if not term:
                                continue
                            # Concatenate every non-empty cell of the row as the
                            # text to embed.
                            combined = " ".join(str(x) for x in row.values if str(x).strip())
                            excel_entries.append({
                                "file": file_name,
                                "sheet": sheet,
                                "term": term,
                                "type": "excel",
                                "text": combined,
                                "source": file_name
                            })
                except Exception:
                    # non-fatal: skip problematic excel
                    continue

            if excel_entries:
                texts = [e["text"] for e in excel_entries]
                embs = model.encode(texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                try:
                    # L2-normalize so inner-product search behaves like cosine.
                    faiss.normalize_L2(embs)
                    index.add(embs)
                    if isinstance(metas, list):
                        metas.extend(excel_entries)
                        loaded = len(metas)
                except Exception:
                    # index may be incompatible or None
                    pass

        # Optionally fetch & embed web content if web_crawler_loader exists
        if web_crawler_loader is not None and SentenceTransformer is not None and faiss is not None:
            try:
                web_entries = web_crawler_loader(
                    urls_file="/home/user/app/data/urls.txt",
                    cache_path=os.path.join(PERSISTENT_DIR, "web_cache.json"),
                    max_pages=2,
                    timeout=15,
                    force_refresh=False,
                )
                if web_entries:
                    # Only pages with a substantive amount of text get embedded.
                    web_texts = [w.get("text", "") for w in web_entries if len(w.get("text","")) > 50]
                    if web_texts:
                        # Reuse the model from the Excel pass when it exists.
                        model = model if 'model' in locals() else SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
                        web_emb = model.encode(web_texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                        faiss.normalize_L2(web_emb)
                        index.add(web_emb)
                        if isinstance(metas, list):
                            # NOTE(review): all web_entries are appended to metas
                            # but only the >50-char texts were embedded, so index
                            # rows and metadata can misalign — verify upstream.
                            metas.extend(web_entries)
                            loaded = len(metas)
            except Exception:
                pass

        # Save the index + meta back to persistent (best-effort)
        try:
            faiss_path = os.path.join(PERSISTENT_DIR, "faiss.index")
            meta_path = os.path.join(PERSISTENT_DIR, "faiss.index.meta.json")
            if faiss is not None and hasattr(faiss, "write_index"):
                faiss.write_index(index, faiss_path)
            with open(meta_path, "w", encoding="utf-8") as f:
                json.dump(metas, f, indent=2)
            # Try upload if helper present
            if _upload_to_dataset is not None:
                try:
                    _upload_to_dataset(faiss_path, meta_path, DATASET_INDEX_REPO)
                except Exception:
                    pass
        except Exception:
            pass

        return f"✅ Rebuild complete: {loaded} entries."
    except Exception as e:
        tb = traceback.format_exc()
        return f"⚠️ Rebuild failed: {e}\n{tb}"
226
+
227
+
228
def reset_faiss_cache():
    """
    Wipe persistent & runtime FAISS/glossary caches, then run
    rebuild_glossary followed by rebuild_index.

    Returns:
        All step status messages joined into one newline-separated string.
    """
    status_lines = []

    # Step 1: best-effort deletion — individual failures are swallowed so one
    # locked file cannot block the reset.
    try:
        targets = [
            os.path.join(PERSISTENT_DIR, "faiss.index"),
            os.path.join(PERSISTENT_DIR, "faiss.index.meta.json"),
            os.path.join(PERSISTENT_DIR, "glossary.json"),
            os.path.join(PERSISTENT_DIR, "web_cache.json"),
            "/home/user/app/runtime_faiss",
        ]
        for target in targets:
            try:
                if os.path.isdir(target):
                    shutil.rmtree(target, ignore_errors=True)
                elif os.path.exists(target):
                    os.remove(target)
            except Exception:
                pass
        status_lines.append("🧹 Persistent FAISS + glossary caches cleared.")
    except Exception as e:
        status_lines.append(f"⚠️ Failed clearing caches: {e}")

    # Step 2: rebuild glossary, then the index; each step contributes its own
    # status line even when it raises.
    for step, failure_fmt in (
        (rebuild_glossary, "⚠️ Rebuild glossary failed: {}"),
        (rebuild_index, "⚠️ Rebuild index failed: {}"),
    ):
        try:
            status_lines.append(step())
        except Exception as e:
            status_lines.append(failure_fmt.format(e))

    return "\n".join(status_lines)
core/bm25.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ import math
5
+ from collections import defaultdict, Counter
6
+
7
+ # --- 🔧 NEW: Lightweight stemming and lemmatization helpers
8
+ try:
9
+ import nltk
10
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
11
+ from nltk.corpus import wordnet
12
+ nltk.download("wordnet", quiet=True)
13
+ nltk.download("omw-1.4", quiet=True)
14
+ except Exception:
15
+ WordNetLemmatizer = PorterStemmer = None
16
+
17
+ # Initialize stemmer and lemmatizer
18
+ _lemmatizer = WordNetLemmatizer() if WordNetLemmatizer else None
19
+ _stemmer = PorterStemmer() if PorterStemmer else None
20
+
21
+
22
def _normalize_token(token: str) -> str:
    """Lowercase a token, then apply lemmatization and stemming when available."""
    word = token.lower().strip()
    # Apply each available normalizer in order: lemmatize first, then stem.
    normalizers = (
        _lemmatizer.lemmatize if _lemmatizer else None,
        _stemmer.stem if _stemmer else None,
    )
    for normalize in normalizers:
        if normalize is None:
            continue
        try:
            word = normalize(word)
        except Exception:
            # Best-effort: keep the current form on any NLTK failure.
            pass
    return word
36
+
37
+
38
class BM25:
    """Okapi BM25 scorer over a list of document dicts.

    Each document dict carries its text under 'text', 'definition' or
    'content' (see _get_text); tokens are stem/lemma-normalized via
    _normalize_token so queries and documents match on the same forms.
    """

    def __init__(self, corpus):
        # corpus = list of dicts each with 'text'
        # 🔧 FIX: support for 'definition' or 'content' fallback if 'text' missing
        self.corpus = corpus
        self.tokenized_corpus = [self._tokenize(self._get_text(d)) for d in corpus]
        self.doc_lens = [len(tokens) for tokens in self.tokenized_corpus]
        self.avgdl = (sum(self.doc_lens) / len(self.doc_lens)) if self.doc_lens else 0.0
        self.doc_freqs = self._calc_doc_freqs()
        # Standard BM25 hyper-parameters.
        self.k1 = 1.5
        self.b = 0.75

    def _get_text(self, doc):
        """Safely extract text from multiple possible keys ('text', 'definition', 'content')."""
        if isinstance(doc, dict):
            for key in ("text", "definition", "content"):
                value = doc.get(key)
                if isinstance(value, str):
                    return value
        return ""

    def _tokenize(self, text):
        """Tokenize and normalize each word with stemming and lemmatization."""
        return [_normalize_token(tok) for tok in re.findall(r"\w+", (text or "").lower()) if tok]

    def _calc_doc_freqs(self):
        """Count, per term, how many documents contain it at least once."""
        freqs = defaultdict(int)
        for tokens in self.tokenized_corpus:
            for term in set(tokens):
                freqs[term] += 1
        return freqs

    def _idf(self, term):
        """Smoothed inverse document frequency (never negative)."""
        N = len(self.tokenized_corpus)
        if N <= 0:
            return 0.0
        df = self.doc_freqs.get(term, 0)
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    def get_scores(self, query_tokens):
        """Return one BM25 score per corpus document for the given query tokens."""
        scores = []
        for doc_tokens, dl in zip(self.tokenized_corpus, self.doc_lens):
            freqs = Counter(doc_tokens)
            # Length-normalization term is constant per document — hoist it.
            length_norm = self.k1 * (1 - self.b + self.b * dl / (self.avgdl or 1.0))
            total = 0.0
            for term in query_tokens:
                tf = freqs.get(term, 0)
                denom = tf + length_norm
                if denom != 0:
                    total += self._idf(term) * ((tf * (self.k1 + 1)) / denom)
            scores.append(total)
        return scores
94
+
95
+
96
def search_bm25(query, docs=None, top_n=10):
    """
    BM25 search helper.

    Args:
        query: free-text query string.
        docs: optional list of dicts (each may have 'text'/'definition'/'content');
            if None, documents are loaded from vector_store.load_all_text_chunks().
        top_n: number of top-scoring documents to return.

    Returns:
        List of shallow-copied doc dicts with an added float 'score' field,
        ordered by descending score. Empty list for empty query/corpus.
    """
    if docs is None:
        # 🔧 FIX: import lazily and only when needed, so callers that pass
        # their own docs don't require core.vector_store to be importable.
        from core.vector_store import load_all_text_chunks
        docs = load_all_text_chunks() or []
    if not docs:
        return []

    bm25 = BM25(docs)

    # 🔧 Normalize query tokens with same stem/lemma logic as the corpus.
    query_tokens = [_normalize_token(t) for t in re.findall(r"\w+", (query or "").lower()) if t]
    if not query_tokens:
        return []

    scores = bm25.get_scores(query_tokens)

    # --- 🎯 Boost Excel glossary sources (MRCT, xlsx/xls) by +15%
    for i, doc in enumerate(docs):
        src = (doc.get("file") or doc.get("source") or "").lower()
        if any(x in src for x in [".xlsx", ".xls", "mrct", "clinical-research-glossary"]):
            scores[i] *= 1.15  # Excel source boost

    # --- Rank and return top_n docs
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    results = []
    for i in top_indices:
        doc = dict(docs[i])  # shallow copy
        # 🔧 Ensure 'text' key exists so retriever can render it
        if "text" not in doc:
            doc["text"] = bm25._get_text(doc)
        doc["score"] = float(scores[i])
        results.append(doc)
    return results
core/glossary.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/glossary.py
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ from difflib import get_close_matches
7
+ from huggingface_hub import hf_hub_download
8
+
9
+ GLOSSARY = None
10
+ GLOSSARY_TERMS_CACHE = [] # 🧠 Cache of glossary keys for fuzzy matching
11
+ DATASET_REPO = "essprasad/CT-Chat-Index"
12
+ GLOSSARY_FILENAME = "persistent/glossary.json"
13
+
14
+
15
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback."""
    if not term:
        return ""
    # Collapse punctuation runs to spaces, then squeeze whitespace.
    normalized = re.sub(r'[\-_/\\.,;:]+', ' ', term.lower().strip())
    normalized = re.sub(r'\s+', ' ', normalized)

    # Common clinical research synonym normalization; applied in this exact
    # order so longer phrases collapse before their shorter substrings
    # (e.g. "serious adverse event" before "adverse event").
    replacements = (
        ("e crf", "ecrf"),
        ("e-crf", "ecrf"),
        ("e/crf", "ecrf"),
        ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for old, new in replacements:
        normalized = normalized.replace(old, new)
    normalized = normalized.strip()

    # 🧩 Fuzzy matching fallback (for plural/singular or typos): map unseen
    # terms onto the closest cached glossary key for better recall.
    if GLOSSARY_TERMS_CACHE and normalized not in GLOSSARY_TERMS_CACHE:
        close = get_close_matches(normalized, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if close:
            return close[0]

    return normalized
45
+
46
+
47
def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached in module globals).

    Accepts both on-disk formats:
      * a dict mapping raw term -> entry (dict or plain definition string), and
      * a list of entry dicts — which is what core.glossary_builder currently
        writes via ``json.dump(list(all_defs.values()), ...)``.

    Returns:
        Mapping of normalized term -> {"term", "definition", "sources"},
        or {} when the download or parse fails.
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        # 🔧 FIX: glossary_builder saves a JSON *list* of entries, while the
        # legacy format was a dict; raw.items() on a list raised and the
        # loader always fell through to the empty-dict path. Iterate
        # uniformly as (key, value) pairs instead.
        if isinstance(raw, list):
            items = [(e.get("term", ""), e) for e in raw if isinstance(e, dict)]
        else:
            items = list(raw.items())

        GLOSSARY = {}
        for k, vlist in items:
            # Skip junk keys: non-strings, over-long phrases, or keys
            # containing a 4-digit run (years, page refs, etc.).
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue

            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k

            norm = _normalize_term(candidate_key)
            if not norm:
                continue

            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []

            # Discard empty or trivially short definitions.
            if not dfn or len(dfn.strip()) < 5:
                continue

            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources if isinstance(sources, list) else []
                }
            else:
                # Merge sources if already exists
                existing = GLOSSARY[norm]
                existing_sources = set(existing.get("sources", []))
                new_sources = set(sources) if sources else set()
                existing["sources"] = list(existing_sources.union(new_sources))

        # 🧠 Store all glossary keys for fuzzy fallback
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())

        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
107
+
108
+
109
+ __all__ = ["_load_glossary", "_normalize_term"]
core/glossary_builder.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 📘 glossary_builder.py — FINAL VERSION WITH MRCT SECTION FIX + CDISC + ABBREVIATIONS
3
+ ------------------------------------------------------------------------------------
4
+ Builds a unified glossary from:
5
+ - PDF glossary files
6
+ - MRCT Clinical Research Glossary (Excel)
7
+ - CDISC Glossary (Excel)
8
+ - Abbreviations (Excel)
9
+ - Web glossary sources
10
+
11
+ Features:
12
+ ✔ Correctly splits MRCT concatenated cells (Glossary Definition + Use in Context…)
13
+ ✔ Removes all duplicated sections
14
+ ✔ Maintains the correct order of sections
15
+ ✔ Handles CDISC Submission Value → Definition extraction
16
+ ✔ Handles Abbreviations.xlsx column patterns
17
+ """
18
+
19
+ import os
20
+ import re
21
+ import json
22
+ import time
23
+ import fitz
24
+ import requests
25
+ import pandas as pd
26
+ from bs4 import BeautifulSoup
27
+ from huggingface_hub import (
28
+ upload_file, HfFolder, list_repo_files, hf_hub_download
29
+ )
30
+
31
+
32
+ # ------------------------------------------------------------------------------
33
+ # CONFIG
34
+ # ------------------------------------------------------------------------------
35
+ DATASET_REPO = "essprasad/CT-Chat-Index"
36
+ DOCS_REPO = "essprasad/CT-Chat-Docs"
37
+
38
+ LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
39
+ REMOTE_GLOSSARY = "persistent/glossary.json"
40
+
41
+ TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
42
+
43
+ WEB_SOURCES = [
44
+ "https://mrctcenter.org/glossaryterm/clinical-research/",
45
+ "https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
46
+ "https://www.cdisc.org/",
47
+ "https://www.ich.org/",
48
+ "https://www.ema.europa.eu/",
49
+ "https://www.who.int/",
50
+ "https://clinicaltrials.gov/",
51
+ ]
52
+
53
+
54
+ # ------------------------------------------------------------------------------
55
+ # HELPERS
56
+ # ------------------------------------------------------------------------------
57
def normalize_term(term: str) -> str:
    """Lowercase a term, collapse punctuation/whitespace, and map well-known
    clinical-research phrases to their standard acronyms."""
    if not term:
        return ""
    # Punctuation runs become single spaces; whitespace runs are squeezed.
    collapsed = re.sub(r"[\-_/\\.,;:]+", " ", term.lower().strip())
    collapsed = re.sub(r"\s+", " ", collapsed)
    # Whole-string synonym table (exact match only, unlike partial replaces).
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "good clinical practice": "gcp",
        "clinical study report": "csr",
        "informed consent form": "icf",
        "adverse event": "ae",
        "serious adverse event": "sae",
        "21 cfr part 11": "21cfrpart11",
    }
    return synonyms.get(collapsed, collapsed)
73
+
74
+
75
def extract_text_from_pdf(path):
    """Return the newline-joined text of every page of the PDF at *path*.

    Returns an empty string (after logging a warning) when the file cannot
    be opened or read.
    """
    try:
        doc = fitz.open(path)
        try:
            # 🔧 FIX: close the document even when page extraction raises,
            # instead of leaking the handle on error.
            text = "\n".join(page.get_text("text") for page in doc)
        finally:
            doc.close()
        return text
    except Exception as e:
        print(f"⚠️ Error reading PDF {path}: {e}")
        return ""
84
+
85
+
86
def extract_definitions_from_text(text):
    """Heuristically extract {normalized_term: entry} pairs from raw PDF text.

    A line is treated as a candidate term; the following lines, up to the
    next line that *looks* like a term, become its definition. Definitions
    shorter than 5 words are discarded.
    """
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]

    i = 0
    while i < len(lines):
        term = lines[i]

        # Skip obvious non-terms: single characters, bare page numbers, and
        # structural headings (TOC, chapters, appendices, index).
        if len(term) <= 1 or term.isdigit():
            i += 1
            continue
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index"]):
            i += 1
            continue

        def_lines = []
        j = i + 1

        # Accumulate definition lines until the next "term-looking" line:
        # short (≤21 chars), starts with a letter, only alnum/hyphen/space,
        # and not ending in a period.
        while j < len(lines):
            nxt = lines[j]
            if (
                re.match(r"^[A-Za-z][A-Za-z0-9\- ]{0,20}$", nxt)
                and not nxt.endswith(".")
            ):
                break
            def_lines.append(nxt)
            j += 1

        definition = " ".join(def_lines).strip()

        # Too short to be a real definition: advance one line only (not to j),
        # so the skipped lines get re-examined as candidate terms.
        if len(definition.split()) < 5:
            i += 1
            continue

        norm = normalize_term(term)
        # NOTE(review): a later occurrence of the same normalized term
        # overwrites the earlier one — confirm last-wins is intended.
        glossary[norm] = {
            "term": term,
            "definition": definition,
        }

        i = j

    return glossary
130
+
131
+
132
def detect_source_type(src: str) -> str:
    """Classify a source reference as 'pdf', 'excel', 'web', or 'other'."""
    lowered = src.lower()
    # Ordered predicate table — first match wins.
    checks = (
        (lambda s: s.endswith(".pdf"), "pdf"),
        (lambda s: s.endswith((".xls", ".xlsx")), "excel"),
        (lambda s: s.startswith("http"), "web"),
    )
    for predicate, kind in checks:
        if predicate(lowered):
            return kind
    return "other"
141
+
142
+
143
def extract_web_glossary(url):
    """Best-effort scrape of term/definition pairs from a glossary web page.

    Returns a list of entry dicts (capped at 50); an empty list on any
    HTTP or parsing failure.
    """
    results = []
    try:
        print(f"🌐 Fetching {url}…")
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            print(f"⚠️ Skipped {url} (HTTP {r.status_code})")
            return []

        # Flatten the page to plain text and mine "Term: definition" shapes.
        page_text = BeautifulSoup(r.text, "html.parser").get_text(separator="\n")
        pairs = re.findall(
            r"([A-Z][A-Za-z0-9 \-]{3,30})[:\-]\s*(.{10,200})", page_text
        )

        for term, definition in pairs[:50]:
            entry = {
                "term": term.strip(),
                "definition": definition.strip(),
                "sources": [url],
                "file": url,
                "type": "web",
            }
            results.append(entry)

    except Exception as e:
        print(f"⚠️ Web extraction error for {url}: {e}")

    return results
174
+
175
+
176
+ # ------------------------------------------------------------------------------
177
+ # MRCT STRUCTURED CELL PARSER
178
+ # ------------------------------------------------------------------------------
179
+ SECTION_LABELS = [
180
+ "Glossary Definition",
181
+ "Use in Context",
182
+ "More Info",
183
+ "Other Info to Think About When Joining a Study",
184
+ "Related Terms",
185
+ "Resource URL",
186
+ ]
187
+
188
+ LABEL_RE = re.compile(
189
+ r"(?i)(Glossary Definition:|Use in Context:|More Info:|Other Info to Think About When Joining a Study:|Related Terms:|Resource URL:)"
190
+ )
191
+
192
+
193
def parse_mrct_cell(cell: str):
    """Split a concatenated MRCT spreadsheet cell into (label, text) pairs.

    The cell may embed several labeled sections ("Glossary Definition:",
    "Use in Context:", ...); LABEL_RE's capture group keeps the matched
    labels in the split output, so `pieces` alternates free text and
    "Label:" tokens.
    """
    if not isinstance(cell, str) or not cell.strip():
        return []

    # Squeeze runs of whitespace before splitting.
    text = re.sub(r"\s{2,}", " ", cell.strip())

    # Split by labels
    pieces = re.split(LABEL_RE, text)
    out = []
    i = 0
    while i < len(pieces):
        p = pieces[i].strip()
        if p == "":
            i += 1
            continue
        if p.endswith(":"):
            # A captured label token: pair it with the chunk that follows.
            # NOTE(review): any piece ending in ":" is treated as a label,
            # which can mis-pair ordinary text ending in a colon — verify
            # against real MRCT data.
            label = p[:-1].strip()
            value = pieces[i + 1].strip() if i + 1 < len(pieces) else ""
            out.append((label, value))
            i += 2
        else:
            # Leading unlabeled text counts as the main definition.
            out.append(("Glossary Definition", p))
            i += 1
    return out
217
+
218
+
219
+ # ------------------------------------------------------------------------------
220
+ # MAIN: GLOSSARY REBUILD
221
+ # ------------------------------------------------------------------------------
222
def rebuild_and_upload():
    """Rebuild the unified glossary from PDFs, Excel files and web sources,
    save it to LOCAL_GLOSSARY, and (when a token is available) upload it to
    the DATASET_REPO dataset on Hugging Face.

    Raises:
        RuntimeError: when the docs repository file listing fails.
    """
    start = time.time()
    print("📘 Starting glossary rebuild…")

    try:
        all_files = list_repo_files(
            repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN
        )
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"❌ Cannot list files: {e}")

    # Accumulates entries keyed by "normalized_term__source" so the same term
    # from different sources is kept separately.
    all_defs = {}

    # ----------------------------------------------------
    # 1️⃣ PDFs
    # ----------------------------------------------------
    # Filenames matching any of these substrings are not glossaries.
    skip_patterns = [
        "topic_", "template", "schedule", "protocol",
        "painac", "sas", "glossary_printable"
    ]

    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary PDF: {pdf}")
            continue

        print(f"🔍 Processing PDF: {pdf}")
        try:
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=pdf,
                token=TOKEN,
                repo_type="dataset"
            )
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)

            # Tag each extracted entry with its provenance.
            for k, v in defs.items():
                v["sources"] = [pdf]
                v["file"] = pdf
                v["type"] = "pdf"
                all_defs[f"{k}__{pdf}"] = v

        except Exception as e:
            print(f"⚠️ PDF extraction error: {pdf}: {e}")

    # ----------------------------------------------------
    # 2️⃣ Excel files (MRCT + Abbreviations + CDISC)
    # ----------------------------------------------------
    for excel in excels:
        try:
            print(f"📗 Processing Excel: {excel}")
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                token=TOKEN,
                repo_type="dataset"
            )

            # sheet_name=None loads every sheet as {name: DataFrame}.
            xls = pd.read_excel(path, sheet_name=None)

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.empty:
                    continue

                df.columns = [str(c).strip() for c in df.columns]
                # Map lowercase column name -> original column name.
                lower_cols = {c.lower(): c for c in df.columns}

                # -----------------------------
                # Detect term column
                # -----------------------------
                term_col = next(
                    (
                        c
                        for c in df.columns
                        if "glossary term" in c.lower() or c.lower() == "term"
                    ),
                    None,
                )

                # Abbreviations
                if not term_col:
                    for c in [
                        "acronym",
                        "abbreviation",
                        "acryonym/abbreviation/initial",
                        "initial",
                    ]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # CDISC
                if not term_col:
                    for c in ["cdisc submission value", "submission value"]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # Fallback: any column mentioning "submission".
                if not term_col:
                    for c in df.columns:
                        if "submission" in c.lower():
                            term_col = c
                            break

                if not term_col:
                    print(f"⏩ Skipping sheet {sheet_name} — no term column")
                    continue

                # -----------------------------
                # MRCT Structured Format
                # -----------------------------
                if "Glossary Definition" in df.columns:
                    # All possible MRCT columns, in preferred output order.
                    mrct_cols = [
                        "Glossary Definition",
                        "Use in Context",
                        "More Info",
                        "Other Info to Think About When Joining a Study",
                        "Related Terms",
                        "Resource URL",
                    ]
                    def_cols = [c for c in mrct_cols if c in df.columns]
                else:
                    # Generic fallback: any column whose name hints at a
                    # definition-like payload.
                    def_cols = [
                        c
                        for c in df.columns
                        if any(
                            k in c.lower()
                            for k in [
                                "definition",
                                "description",
                                "cdisc definition",
                                "context",
                                "info",
                                "related",
                            ]
                        )
                    ]

                # Last resort: assume the second column holds definitions.
                if not def_cols and len(df.columns) > 1:
                    def_cols = [df.columns[1]]

                # -----------------------------
                # Extract rows
                # -----------------------------
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    # Clean + dedupe: `seen` prevents the same section text
                    # from being appended twice.
                    def_parts = []
                    seen = set()

                    if "Glossary Definition" in df.columns:
                        raw = str(row.get("Glossary Definition", "")).strip()
                        parsed = parse_mrct_cell(raw)

                        if parsed:
                            # Emit sections in SECTION_LABELS order first.
                            for label in SECTION_LABELS:
                                for plabel, ptext in parsed:
                                    if plabel.lower() == label.lower() and ptext.strip():
                                        if ptext not in seen:
                                            def_parts.append(f"<b>{label}:</b> {ptext}")
                                            seen.add(ptext)

                            # Add missing columns (non-duplicates)
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                        else:
                            # Fallback to direct column reading
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                    else:
                        # Non-MRCT Excel rows
                        for c in def_cols:
                            val = str(row.get(c, "")).strip()
                            if val and val not in seen:
                                def_parts.append(f"<b>{c}:</b> {val}")
                                seen.add(val)

                    if not def_parts:
                        continue

                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [excel],
                        "file": excel,
                        "sheet": sheet_name,
                        "type": "excel",
                    }

                    key = f"{normalize_term(term)}__{excel}"
                    all_defs[key] = entry

            print(f"✅ Processed Excel: {excel}")

        except Exception as e:
            print(f"⚠️ Excel extraction error: {excel}: {e}")

    # ----------------------------------------------------
    # 3️⃣ Web Sources
    # ----------------------------------------------------
    web_defs = []
    for url in WEB_SOURCES:
        items = extract_web_glossary(url)
        for e in items:
            key = f"{normalize_term(e['term'])}__{e['file']}"
            all_defs[key] = e
            web_defs.append(e)

    print(f"🌐 Added {len(web_defs)} web entries.")

    # ----------------------------------------------------
    # 4️⃣ SAVE glossary.json
    # ----------------------------------------------------
    # NOTE(review): this writes a JSON *list* of entries; any consumer that
    # expects a {term: entry} mapping must handle both shapes.
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # ----------------------------------------------------
    # 5️⃣ UPLOAD TO HUGGINGFACE
    # ----------------------------------------------------
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

    print(f"✨ Glossary rebuild complete in {time.time() - start:.1f}s")
476
+
477
+
478
+ # ------------------------------------------------------------------------------
479
+ if __name__ == "__main__":
480
+ rebuild_and_upload()
core/hybrid_retriever.py ADDED
@@ -0,0 +1,925 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid retriever (drop-in replacement)
3
+ -------------------------------------
4
+ - Preserves original function & variable names and signatures.
5
+ - Integrates CDISC Excel runtime loader, Abbreviations.xlsx loader,
6
+ PyMuPDF-based clinical-informatics PDF parser,
7
+ and MRCT duplicate-section dedupe.
8
+ - Injects abbreviation and CDISC entries as separate answers (one per term).
9
+ - Uses FAISS + BM25 retrieval as before.
10
+ """
11
+
12
+ import os
13
+ import re
14
+ import time
15
+ import glob
16
+ from urllib.parse import urlparse
17
+ from difflib import SequenceMatcher
18
+ from pathlib import Path
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # optional libs
24
+ try:
25
+ import pandas as pd
26
+ except Exception:
27
+ pd = None
28
+
29
+ try:
30
+ import fitz # PyMuPDF
31
+ except Exception:
32
+ fitz = None
33
+
34
+ # project imports
35
+ from core.glossary import _normalize_term
36
+ from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
37
+ from core.bm25 import search_bm25
38
+ from utils.nlp_helpers import extract_van_tokens, normalize_query_text
39
+
40
# ----------------------------
# CONFIG
# ----------------------------
DENSE_TOP_K = 10       # number of dense (FAISS) hits requested per query (see summarize_combined)
FUZZY_THRESHOLD = 0.15  # NOTE(review): not referenced in the visible part of this module — confirm usage
TOP_RESULTS_LIMIT = 5   # NOTE(review): not referenced in the visible part of this module — confirm usage

GCDMP_FILENAME = "GCDMP_Glossary.pdf"  # exact filename in your HF space/persistent store
48
+
49
+ # ----------------------------
50
+ # UTILITIES (preserve names)
51
+ # ----------------------------
52
def fuzzy_ratio(a: str, b: str) -> float:
    """Return the difflib similarity ratio (0.0–1.0) of two strings.

    Falsy inputs (None, "") are treated as empty strings.
    """
    left = a if a else ""
    right = b if b else ""
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
54
+
55
def strip_question_phrases(text: str) -> str:
    """Strip leading question/filler words and trailing punctuation from a query.

    Lowercases the text, then repeatedly removes any leading prefix from the
    list below when it is followed by whitespace or an apostrophe, finally
    trimming trailing '?', '.', '!' and collapsing repeated spaces.

    FIX: the original list had two missing commas, silently fusing adjacent
    literals via implicit string concatenation ("us" "they" -> "usthey" and
    "expand on" + "what" -> "expand onwhat"), so "us" and "they" were never
    stripped. A fully duplicated run of eight prefixes was also removed
    (harmless for matching, but noise).
    """
    text = (text or "").lower().strip()
    prefixes = [
        "what", "how", "when", "why", "define", "definition", "meaning", "explain",
        "describe", "expand", "abbreviate", "compare", "identify", "classify",
        "determine", "do", "did", "does", "done", "can", "shall",
        "will", "where", "which", "who", "whose", "have", "might", "could", "would",
        "kindly", "please", "may", "you", "i", "we", "us", "they", "there", "here",
        "what's", "i'll", "where's", "how's", "there's", "who's", "didn't", "doesn't",
        "give", "provide", "mention", "state", "arrange", "asking", "tell", "explain me",
        "can you", "could you", "would you", "please explain", "let me know",
        "say something about", "give details of", "show me", "find", "list", "expand on",
        "is", "was", "were", "are",
    ]
    # Anchor at start; a prefix only counts when followed by space(s) or an apostrophe.
    prefix_pattern = r"^(" + "|".join(re.escape(p) for p in prefixes) + r")(\s+|['’])"
    while re.match(prefix_pattern, text):
        text = re.sub(prefix_pattern, "", text).strip()
    text = re.sub(r"[?.!]+$", "", text)   # drop trailing punctuation
    text = re.sub(r"\s{2,}", " ", text)   # collapse runs of spaces
    return text.strip()
76
+
77
def add_links_to_text(text: str) -> str:
    """Wrap every http(s) URL found in *text* with an HTML anchor tag."""
    url_pattern = r"(https?://[^\s<]+)"
    anchor_tmpl = r'<a href="\1" target="_blank" rel="noopener noreferrer">\1</a>'
    return re.sub(url_pattern, anchor_tmpl, text)
79
+
80
def get_source_rank(src: str, src_type: str) -> int:
    """Rank a source for display ordering (lower = higher priority).

    1 = GCDMP glossary PDF, 2 = Excel/MRCT, 3 = ICH guidance documents,
    4 = other PDFs, 5 = web sources, 6 = anything else.
    """
    name = (src or "").lower()

    # 1. GCDMP glossary PDF wins outright.
    if GCDMP_FILENAME.lower() in name:
        return 1
    # 2. Any Excel source, or anything MRCT-branded.
    if src_type == "excel" or "mrct" in name:
        return 2
    # 3. ICH documents (E6, E3, E2A, E9, E1) under common filename spellings.
    ich_markers = [
        "ich_e6", "ich-e6", "ich e6",
        "ich_e3", "ich-e3", "ich e3",
        "ich_e2", "ich-e2", "ich e2",
        "ich_e9", "ich-e9", "ich e9",
        "ich_e1", "ich-e1", "ich e1",
    ]
    if any(marker in name for marker in ich_markers):
        return 3
    # 4/5. Remaining PDFs, then web pages.
    if src_type == "pdf":
        return 4
    if src_type == "web":
        return 5
    return 6
98
+
99
# Patterns to filter junk lines commonly found in PDF extractions.
# Each pattern is matched case-insensitively against a whole stripped line;
# a match means the line is dropped (see clean_extracted_text()).
JUNK_PATTERNS = [
    r"^\s*\d+\s*$", # page-only lines
    r"^\s*Page\s+\d+\s*$",
    r"^\s*Table of Contents.*$",
    r"^\s*Figure\s+\d+.*$",
    r"^\s*Section\s+\d+.*$",
    r".*\.{5,}.*", # dotted lines
    r"^\s*-{3,}\s*$",
    r"^\s*_+\s*$",
    r"^\s*required by regulatory authorities.*$",  # recurring boilerplate fragment
]
# Compiled once at import time so per-line filtering stays cheap.
_COMPILED_JUNK = [re.compile(p, flags=re.IGNORECASE) for p in JUNK_PATTERNS]
112
+
113
def clean_extracted_text(text: str) -> str:
    """Remove junk lines (page numbers, dotted leaders, boilerplate) from
    PDF-extracted text and normalize dot/space runs on the lines kept."""
    if not text:
        return text
    kept = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Drop any line matching one of the pre-compiled junk patterns.
        if any(pattern.match(line) for pattern in _COMPILED_JUNK):
            continue
        line = re.sub(r'\.{3,}', '.', line)   # collapse dotted leaders
        line = re.sub(r'\s{2,}', ' ', line)   # collapse runs of whitespace
        kept.append(line)
    return "\n".join(kept)
133
+
134
def dedupe_section_headers(txt):
    """Drop repeated MRCT-style section header lines (e.g. 'Use in Context:').

    Only the first occurrence of each known heading is kept; all other lines
    pass through unchanged.
    """
    if not txt:
        return txt
    known_headings = {
        "Glossary Definition", "Use in Context", "More Info",
        "Other Info to Think About When Joining a Study", "Related Terms", "Term URL",
        "Other Resources", "Resource URL"
    }
    heading_re = re.compile(r"^([A-Za-z0-9 \-]{3,200}):\s*$")
    already_seen = set()
    kept = []
    for line in txt.splitlines():
        match = heading_re.match(line)
        if match:
            label = match.group(1).strip()
            if label in known_headings:
                if label in already_seen:
                    # Repeated known heading -> drop the line entirely.
                    continue
                already_seen.add(label)
        kept.append(line)
    return "\n".join(kept)
162
+
163
+ # ----------------------------
164
+ # Excel / MRCT section parser (robust)
165
+ # ----------------------------
166
+ # Recognizes label variants and returns ordered sections as list of (label, text)
167
+ _SECTION_LABELS_ORDER = [
168
+ "Glossary Definition",
169
+ "Use in Context",
170
+ "More Info",
171
+ "Other Info to Think About When Joining a Study",
172
+ "Related Terms",
173
+ "Other Resources",
174
+ "Resource URL",
175
+ "Term URL",
176
+ "CDISC/NCI URL"
177
+ ]
178
+
179
+ # label alternatives to catch small variations
180
+ _LABEL_ALIASES = {
181
+ "glossary definition": ["glossary definition", "definition", "glossarydefinition"],
182
+ "use in context": ["use in context", "use in context:"],
183
+ "more info": ["more info", "more information", "additional info", "additional information"],
184
+ "other info to think about when joining a study": [
185
+ "other info to think about when joining a study",
186
+ "other info to think about when joining the study",
187
+ "other info to think about when joining a study:"
188
+ ],
189
+ "related terms": ["related terms", "related term", "related terms:"],
190
+ "other resources": ["other resources", "other resource"],
191
+ "resource url": ["resource url", "other resources:", "other resources:"],
192
+ "term url": ["term url", "term url:"],
193
+ "cdisc/nci url": ["cdisc/nci url", "cdisc nci url"],
194
+ }
195
+
196
+ # regex to find labels in a single blob of text
197
+ _LABEL_RE = re.compile(
198
+ r"(?P<label>(Glossary Definition|Definition|Use in Context|More Info|More Information|Other Info to Think About When Joining a Study|Other Info|Related Terms|Related Terms:|Related Term|Other Resources|Resource URL|Term URL|CDISC/NCI URL))\s*[:\-]\s*",
199
+ flags=re.IGNORECASE
200
+ )
201
+
202
def parse_excel_sections(blob: str):
    """
    Parse a text blob that may contain multiple labeled sections (MRCT style).

    Returns an ordered list of (label, text) tuples: sections that map to the
    canonical label set come first, in _SECTION_LABELS_ORDER; any remaining
    labeled sections are appended afterwards.

    FIX: the original emitted ONLY the canonical-order sections whenever at
    least one canonical label matched, silently dropping sections whose labels
    fell through to the title-case fallback (content loss). Leftover sections
    are now appended after the ordered ones.

    If no labels are found at all, the blob is split heuristically on blank
    lines; failing that, the whole blob is returned as a single
    ('Glossary Definition', blob) entry.
    """
    if not blob or not isinstance(blob, str):
        return []

    # Normalize common HTML remnants (<br>, <b>, ...) before parsing.
    b = re.sub(r"<br\s*/?>", "\n", blob, flags=re.IGNORECASE)
    b = re.sub(r"</?[^>]+>", "", b)  # strip remaining tags conservatively

    matches = list(_LABEL_RE.finditer(b))
    if not matches:
        # No labels found: heuristic split by double newline.
        if "\n\n" in b:
            parts = [p.strip() for p in b.split("\n\n") if p.strip()]
            # First part is assumed to be the definition; the rest become info blocks.
            out = [("Glossary Definition", parts[0])]
            for i, p in enumerate(parts[1:], start=1):
                label = "More Info" if i == 1 else f"Other Info {i}"
                out.append((label, p))
            return out
        # Fallback: single undifferentiated block.
        return [("Glossary Definition", b.strip())]

    # Slice the blob into (label, value) spans between consecutive label matches.
    spans = []
    for idx, m in enumerate(matches):
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(b)
        label = m.group("label").strip().rstrip(":").strip()
        val = b[start:end].strip()
        spans.append((label, val))

    # Normalize raw labels onto canonical ones (substring alias match),
    # concatenating values when the same canonical label repeats.
    canonical = {}
    for lab, val in spans:
        key = lab.lower().strip().rstrip(":")
        mapped = None
        for canon, aliases in _LABEL_ALIASES.items():
            if any(alias in key for alias in aliases):
                mapped = canon
                break
        if not mapped:
            # Unrecognized label: keep it, title-cased.
            mapped = lab.strip().title()
        canonical[mapped] = canonical.get(mapped, "") + ("\n\n" + val if canonical.get(mapped) else val)

    # Emit canonical sections in the preferred order first...
    out = []
    emitted = set()
    for label in _SECTION_LABELS_ORDER:
        key = label.lower()
        if key in canonical:
            out.append((label, canonical[key].strip()))
            emitted.add(key)
    # ...then append any remaining sections instead of dropping them.
    for key, val in canonical.items():
        if key not in emitted:
            out.append((key, val.strip()))
    return out
270
+
271
+ # ----------------------------
272
+ # Abbreviations.xlsx loader (Priority B: treated as excel)
273
+ # ----------------------------
274
def load_abbreviations_entries(search_paths=None):
    """
    Discover Abbreviations.xlsx files and return candidate dicts:
        [{'definition','text','file','type','term','sources'}, ...]

    Header names for the term/definition columns are auto-detected; when the
    headers are unrecognized, positional columns are used (A=term,
    B=definition). Extra *search_paths* are tried before the built-in defaults.

    FIX: cell values are now read through _cell(), which maps pandas NaN /
    None to "". The original used str(value or ""), but NaN is a truthy float,
    so missing cells became the literal string "nan" and produced bogus
    term="nan" / definition="nan" entries.
    """
    if pd is None:
        logger.warning("pandas not installed — skipping Abbreviations.xlsx load.")
        return []

    # Common HF cache & persistent paths where the dataset may be synced.
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        default_paths = list(search_paths) + default_paths

    # Recursive discovery of abbreviation workbooks under all candidate roots.
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*Abbreviations*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/Abbreviations.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbrev*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbreviations*.xls*"), recursive=True))
        except Exception:
            continue
    files = list(dict.fromkeys(files))  # de-duplicate, keep discovery order

    def _cell(row, col):
        # Read a cell as a stripped string; NaN/None become "" (see FIX above).
        v = row.get(col)
        if v is None:
            return ""
        try:
            if pd.isna(v):
                return ""
        except (TypeError, ValueError):
            pass
        return str(v).strip()

    entries = []
    for fx in files:
        try:
            if fx.lower().endswith("x"):
                # .xlsx: prefer the openpyxl engine, fall back to pandas' default.
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read Abbreviations file %s: %s", fx, e)
            continue

        cols = {c.strip().lower(): c for c in df.columns}
        # Possible header spellings for the term and definition columns.
        term_candidates = ["abbreviation", "acronym", "term", "short form", "initial", "abbrev", "abbrev."]
        def_candidates = ["definition", "description", "long name", "meaning", "full form", "explanation"]

        term_col = None
        def_col = None
        for k, v in cols.items():
            if any(tc in k for tc in term_candidates) and term_col is None:
                term_col = v
            if any(dc in k for dc in def_candidates) and def_col is None:
                def_col = v

        # Fall back to positional columns A/B when headers are unrecognized.
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or (df.columns[1] if len(df.columns) > 1 else df.columns[0])
            except Exception:
                logger.warning("Abbreviations file %s missing expected term/definition columns. Skipping.", fx)
                continue

        count = 0
        for _, row in df.iterrows():
            term = _cell(row, term_col)
            definition = _cell(row, def_col)
            if not term or not definition:
                continue
            entries.append({
                "definition": definition,
                "text": definition,  # same text used for retrieval
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
            count += 1
        logger.info("Loaded %d abbreviations from %s", count, fx)
    logger.info("Total loaded abbreviations: %d", len(entries))
    return entries
367
+
368
+ # ----------------------------
369
+ # CDISC Excel loader (improved HF cache discovery)
370
+ # ----------------------------
371
def load_cdisc_entries(search_paths=None):
    """
    Discover CDISC Excel files and return normalized candidate dicts:
        [{'definition','text','file','type','term','sources'}, ...]

    Column headers (submission value / synonyms / definition) are auto-detected
    with a positional A/B/C fallback. Extra *search_paths* are tried before the
    built-in defaults.

    FIX: cell values are now read through _cell(), which maps pandas NaN /
    None to "". The original used str(value or ""), but NaN is a truthy float,
    so missing cells became the literal string "nan" and produced bogus
    term="nan" / definition="nan" entries.
    """
    if pd is None:
        logger.warning("pandas not installed — skipping CDISC Excel load.")
        return []

    # HF cache path where datasets are downloaded during rebuild.
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        default_paths = list(search_paths) + default_paths

    # Recursive discovery of CDISC workbooks under all candidate roots.
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*[Cc][Dd][Ii][Ss][Cc]*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/CDISC Glossary.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*CDISC*.xls*"), recursive=True))
        except Exception:
            continue
    files = list(dict.fromkeys(files))  # unique, keep discovery order

    def _cell(row, col):
        # Read a cell as a stripped string; NaN/None/missing column become "".
        if col is None:
            return ""
        v = row.get(col)
        if v is None:
            return ""
        try:
            if pd.isna(v):
                return ""
        except (TypeError, ValueError):
            pass
        return str(v).strip()

    entries = []
    for fx in files:
        try:
            if fx.lower().endswith("x"):
                # .xlsx: prefer the openpyxl engine, fall back to pandas' default.
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read CDISC file %s: %s", fx, e)
            continue

        cols = {c.strip().lower(): c for c in df.columns}
        term_col = cols.get("cdisc submission value") or cols.get("term") or cols.get("submission value")
        syn_col = cols.get("cdisc synonym(s)") or cols.get("cdisc synonym") or cols.get("synonym(s)") or cols.get("synonyms")
        def_col = cols.get("cdisc definition") or cols.get("definition") or cols.get("cdisc definition(s)")

        # Fall back to positional columns A/B/C when headers differ.
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or df.columns[2]
                syn_col = syn_col or (df.columns[1] if len(df.columns) > 1 else None)
            except Exception:
                logger.warning("CDISC file %s missing expected columns (A or C). Skipping.", fx)
                continue

        for _, row in df.iterrows():
            term = _cell(row, term_col)
            synonyms = _cell(row, syn_col)
            definition = _cell(row, def_col)
            if not term or not definition:
                continue
            # Include synonyms in the retrieval text for better matching.
            text_parts = [definition]
            if synonyms:
                text_parts.append("Synonyms: " + synonyms)
            combined = "\n\n".join([p for p in text_parts if p])
            entries.append({
                "definition": definition,
                "text": combined,
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
    logger.info("Loaded %d CDISC entries from %d files", len(entries), len(files))
    return entries
457
+
458
+ # ----------------------------
459
+ # Clinical-informatics PDF parser using PyMuPDF (fitz)
460
+ # ----------------------------
461
def parse_clinical_informatics_pdf(path):
    """
    Parse clinical-informatics-acronym-glossary.pdf with PyMuPDF (fitz).

    Short ALL-CAPS paragraph headings (letters/digits/dash/slash, max 12 chars)
    are treated as acronyms; every following non-heading paragraph block is
    collected as that acronym's description. Returns candidate dicts shaped
    like the other loaders: {'definition','text','file','type','term','sources'}.
    """
    if fitz is None:
        logger.warning("PyMuPDF (fitz) not installed — skipping clinical-informatics PDF parsing.")
        return []

    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.exception("Failed to open PDF %s: %s", path, e)
        return []

    # Gather plain text page by page, skipping pages that fail to extract.
    pages = []
    for page in doc:
        try:
            page_text = page.get_text("text")
            if page_text:
                pages.append(page_text)
        except Exception:
            continue
    doc.close()

    whole = "\n".join(pages).replace("\r", "")
    # Split into paragraph blocks on blank lines.
    blocks = [b.strip() for b in re.split(r"\n\s*\n", whole) if b.strip()]

    acronym_re = re.compile(r"^[A-Z0-9\-/]{1,12}$")
    entries = []
    i = 0
    while i < len(blocks):
        heading = blocks[i].splitlines()[0].strip()
        if acronym_re.match(heading):
            # Collect every following block up to the next acronym heading.
            j = i + 1
            body = []
            while j < len(blocks):
                if acronym_re.match(blocks[j].splitlines()[0].strip()):
                    break
                body.append(blocks[j])
                j += 1
            if body:
                definition = "\n\n".join(body).strip()
                entries.append({
                    "definition": definition,
                    "text": definition,
                    "file": os.path.basename(path),
                    "type": "pdf",
                    "term": heading,
                    "sources": [os.path.basename(path)]
                })
            i = j
        else:
            i += 1

    logger.info("Parsed %d entries from clinical-informatics PDF (PyMuPDF).", len(entries))
    return entries
523
+
524
+ # ----------------------------
525
+ # MAIN RETRIEVER (preserve name)
526
+ # ----------------------------
527
+ def summarize_combined(query: str, mode: str = "short") -> str:
528
+ start = time.time()
529
+ if not query or not query.strip():
530
+ return "<i>No query provided.</i>"
531
+
532
+ # Normalize user query
533
+ cleaned = strip_question_phrases(query)
534
+ expanded = normalize_query_text(cleaned)
535
+ van_tokens = extract_van_tokens(expanded)
536
+ normalized = " ".join(van_tokens).strip() or cleaned
537
+ nq = normalized.lower().strip()
538
+ print(f"🔍 summarize_combined() | cleaned='{cleaned}' normalized='{nq}'")
539
+
540
+ # Acronym expansion map (preserve/extend)
541
+ acronym_map = {
542
+ "ae": "adverse event", "adr": "adverse drug reaction",
543
+ "crf": "case report form", "ecrf": "electronic case report form",
544
+ "cro": "contract research organization", "csr": "clinical study report",
545
+ "ctms": "clinical trial management system", "edc": "electronic data capture",
546
+ "ehr": "electronic health record", "emr": "electronic medical record",
547
+ "gcp": "good clinical practice", "irb": "institutional review board",
548
+ "iec": "independent ethics committee", "ind": "investigational new drug application",
549
+ "mrct": "multi-regional clinical trials", "qa": "quality assurance",
550
+ "qc": "quality control", "sae": "serious adverse event", "sap": "statistical analysis plan",
551
+ "siv": "site initiation visit", "sop": "standard operating procedure",
552
+ "ssu": "study start-up", "uat": "user acceptance testing",
553
+ "whodrug": "world health organization drug dictionary",
554
+ }
555
+
556
+ glossary_key = _normalize_term(nq)
557
+ if glossary_key in acronym_map:
558
+ expanded_term = acronym_map[glossary_key]
559
+ nq = _normalize_term(expanded_term)
560
+ print(f"🔁 Acronym expanded → {expanded_term}")
561
+
562
+ # ----------------------------
563
+ # FAISS + BM25 retrieval
564
+ # ----------------------------
565
+ dense_hits, bm25_hits = [], []
566
+ try:
567
+ if _ensure_faiss_index():
568
+ dense_hits = search_index(normalized, top_k=DENSE_TOP_K) or []
569
+ print(f"✅ FAISS hits: {len(dense_hits)}")
570
+ except Exception as e:
571
+ print(f"⚠️ FAISS search failed: {e}")
572
+
573
+ try:
574
+ docs = load_all_text_chunks()
575
+ if docs:
576
+ bm25_hits = search_bm25(normalized, docs, top_n=8) or []
577
+ print(f"✅ BM25 hits: {len(bm25_hits)}")
578
+ except Exception as e:
579
+ print(f"⚠️ BM25 fallback failed: {e}")
580
+
581
+ # ----------------------------
582
+ # Inject Abbreviations + CDISC + clinical-informatics PDF parsed entries (runtime)
583
+ # ----------------------------
584
+ extra_hits = []
585
+ try:
586
+ abbrev_entries = load_abbreviations_entries()
587
+ for e in abbrev_entries:
588
+ extra_hits.append({
589
+ "definition": e["definition"],
590
+ "text": e["text"],
591
+ "file": e["file"],
592
+ "type": e["type"],
593
+ "term": e["term"],
594
+ "sources": e.get("sources", [])
595
+ })
596
+ except Exception as e:
597
+ logger.exception("Abbreviations load failed: %s", e)
598
+
599
+ try:
600
+ cdisc_entries = load_cdisc_entries()
601
+ for e in cdisc_entries:
602
+ extra_hits.append({
603
+ "definition": e["definition"],
604
+ "text": e["text"],
605
+ "file": e["file"],
606
+ "type": e["type"],
607
+ "term": e["term"],
608
+ "sources": e.get("sources", [])
609
+ })
610
+ except Exception as e:
611
+ logger.exception("CDISC load failed: %s", e)
612
+
613
+ try:
614
+ pdf_paths = glob.glob("./*clinical*informatics*.pdf") + glob.glob("/mnt/data/*clinical*informatics*.pdf") + glob.glob("/workspace/data/*clinical*informatics*.pdf")
615
+ pdf_paths = list(dict.fromkeys(pdf_paths))
616
+ for p in pdf_paths:
617
+ parsed = parse_clinical_informatics_pdf(p)
618
+ for e in parsed:
619
+ extra_hits.append({
620
+ "definition": e["definition"],
621
+ "text": e["text"],
622
+ "file": e["file"],
623
+ "type": e["type"],
624
+ "term": e["term"],
625
+ "sources": e.get("sources", [])
626
+ })
627
+ except Exception as e:
628
+ logger.exception("clinical-informatics parse failed: %s", e)
629
+
630
+ hits = (dense_hits or []) + (bm25_hits or []) + extra_hits
631
+ if not hits:
632
+ return "<i>No relevant information found.</i>"
633
+
634
+ # ----------------------------
635
+ # Group by original resolved source (prefer real source over glossary.json)
636
+ # ----------------------------
637
+ grouped = {}
638
+ glossary_fallbacks = []
639
+
640
+ for h in hits:
641
+ raw_src = h.get("file") or h.get("source") or h.get("source_file") or "unknown"
642
+ meta_sources = h.get("sources") or h.get("source_list") or []
643
+
644
+ # prefer a non-glossary meta source if available
645
+ src = raw_src
646
+ if isinstance(meta_sources, (list, tuple)) and meta_sources:
647
+ chosen = None
648
+ for s in meta_sources:
649
+ if isinstance(s, str) and not s.lower().endswith("glossary.json"):
650
+ chosen = s
651
+ break
652
+ if chosen:
653
+ src = chosen
654
+ else:
655
+ src = meta_sources[0]
656
+
657
+ src_type = (h.get("type") or "").lower()
658
+ term = (h.get("term") or "").strip()
659
+ term_lower = term.lower()
660
+
661
+ txt = (h.get("definition") or h.get("text") or h.get("content") or h.get("full_text") or "").strip()
662
+ if not txt:
663
+ continue
664
+
665
+ txt = clean_extracted_text(txt)
666
+ # If original stored file was glossary.json, keep as fallback only
667
+ if str(raw_src).lower().endswith("glossary.json"):
668
+ glossary_fallbacks.append({"hit": h, "text": txt, "src": src})
669
+
670
+ # Save resolved sources for provenance. Ensure URL-like sources are preserved.
671
+ resolved_sources = meta_sources if meta_sources else []
672
+ # If resolved_sources empty, try to collect URL-like values from hit fields or src
673
+ if not resolved_sources:
674
+ possible = []
675
+ for key in ("url", "source", "link", "file"):
676
+ v = h.get(key)
677
+ if isinstance(v, str) and v.startswith("http"):
678
+ possible.append(v)
679
+ if isinstance(src, str) and src.startswith("http"):
680
+ possible.append(src)
681
+ # fallback to raw_src if nothing else
682
+ resolved_sources = possible or [raw_src]
683
+ # normalize to list
684
+ if isinstance(resolved_sources, (list, tuple)):
685
+ resolved_sources = list(resolved_sources)
686
+ else:
687
+ resolved_sources = [resolved_sources]
688
+ h["_resolved_sources"] = resolved_sources
689
+
690
+ # For MRCT-like text (detected by filename), dedupe repeated sections first
691
+ if "mrct" in str(src).lower() or "mrct" in str(raw_src).lower():
692
+ txt = dedupe_section_headers(txt)
693
+
694
+ # Group key based on resolved original source + type + term
695
+ # Special-case Abbreviations so each term is unique (Priority B behavior)
696
+ src_l = str(src).lower()
697
+ raw_src_l = str(raw_src).lower()
698
+ if "abbreviations.xlsx" in src_l or "abbreviations.xlsx" in raw_src_l or ("abbreviations" in src_l and src_type == "excel"):
699
+ key = f"abbrev__excel__{term_lower}"
700
+ # Special-case CDISC so each term is unique (Option A)
701
+ elif "cdisc glossary.xlsx" in src_l or "cdisc glossary.xlsx" in raw_src_l or ("cdisc" in src_l and src_type == "excel"):
702
+ key = f"cdisc__excel__{term_lower}"
703
+ else:
704
+ key = f"{os.path.basename(src).lower()}__{src_type}__{term_lower[:200]}"
705
+
706
+ # Prefer glossary PDF entries (GCDMP/ 'glossary' in filename) when colliding with long chunks
707
+ prefer_glossary = (GCDMP_FILENAME.lower() in str(src).lower()) or ("glossary" in str(src).lower())
708
+
709
+ if key not in grouped:
710
+ grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
711
+ else:
712
+ existing_src = grouped[key]["src"]
713
+ existing_is_glossary = (GCDMP_FILENAME.lower() in str(existing_src).lower()) or ("glossary" in str(existing_src).lower())
714
+ if prefer_glossary and not existing_is_glossary:
715
+ grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
716
+ else:
717
+ # otherwise prefer longer chunk unless this new is a glossary and existing is not
718
+ if not prefer_glossary and len(txt) > len(grouped[key]["text"]):
719
+ grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
720
+
721
+ # ----------------------------
722
+ # Format answers: one per original source
723
+ # ----------------------------
724
+ answers = []
725
+ src_counts = {"excel": 0, "pdf": 0, "web": 0, "other": 0}
726
+
727
+ # ensure only one combined excel answer per term to prevent duplicated sections
728
+ seen_excel_terms = set()
729
+
730
+ for entry in grouped.values():
731
+ h = entry["hit"]
732
+ txt = entry["text"]
733
+ src = entry["src"]
734
+ src_type = entry.get("src_type") or (h.get("type") or "").lower()
735
+ term = entry.get("term") or (h.get("term") or "").strip()
736
+ term_lower = (term or "").lower()
737
+
738
+ # Skip entries resolved to glossary.json here (we'll use them only as fallback)
739
+ if str(src).lower().endswith("glossary.json"):
740
+ continue
741
+
742
+ # If this is an excel entry for MRCT/CDISC/Abbrev, ensure only first combined answer per term
743
+ is_excel = (src_type == "excel") or str(src).lower().endswith((".xls", ".xlsx"))
744
+ if is_excel:
745
+ if term_lower in seen_excel_terms:
746
+ # skip duplicate excel results for same term (they will be combined in the first occurrence)
747
+ continue
748
+ # mark as seen (so subsequent excel chunks won't produce duplicates)
749
+ seen_excel_terms.add(term_lower)
750
+
751
+ # Skip noisy PDF sections unless they look like short glossary entries
752
+ txt_lower = txt.lower()
753
+ if src_type == "pdf" and any(k in txt_lower[:300] for k in ["table of contents", "appendix", "index", "section"]):
754
+ if not (len(txt.split()) < 80 and term_lower and term_lower in txt_lower[:120]):
755
+ # treat as noise
756
+ continue
757
+
758
+ # Determine icon and counts
759
+ if src_type == "excel":
760
+ icon, cat = "📘", "excel"
761
+ elif src_type == "pdf":
762
+ icon, cat = "📄", "pdf"
763
+ elif src_type == "web":
764
+ icon, cat = "🌐", "web"
765
+ else:
766
+ icon, cat = "📁", "other"
767
+ src_counts[cat] += 1
768
+
769
+ # Extract excerpt (PDF / web special handling for glossary-style)
770
+ excerpt = ""
771
+ if src_type in ("pdf", "web"):
772
+ paragraphs = re.split(r"\n\s*\n", txt)
773
+ paragraphs = [p.strip() for p in paragraphs if p.strip()]
774
+
775
+ # 1) If full term matches heading (e.g., "electronic health record")
776
+ if paragraphs and term_lower:
777
+ heading = paragraphs[0].strip().lower()
778
+ if heading == term_lower or (term_lower in heading):
779
+ excerpt = paragraphs[1].strip() if len(paragraphs) > 1 else paragraphs[0].strip()
780
+
781
+ # 2) If not yet found, try exact normalized query inside paragraphs
782
+ if not excerpt:
783
+ found = None
784
+ for p in paragraphs:
785
+ if nq and nq in p.lower():
786
+ found = p.strip()
787
+ break
788
+
789
+ # 3) Fuzzy match with paragraph starts
790
+ if not found and term_lower:
791
+ for p in paragraphs:
792
+ if fuzzy_ratio(term_lower, p.lower()[:100]) > 0.75:
793
+ found = p.strip()
794
+ break
795
+
796
+ # 4) Paragraph following a heading that contains the term
797
+ if not found and term_lower:
798
+ for i, p in enumerate(paragraphs[:-1]):
799
+ if term_lower in p.lower():
800
+ found = paragraphs[i + 1].strip()
801
+ break
802
+
803
+ excerpt = (found or (paragraphs[0] if paragraphs else txt)).strip()
804
+
805
+ excerpt = excerpt[:2000] + ("..." if len(excerpt) > 2000 else "")
806
+ excerpt = add_links_to_text(excerpt)
807
+
808
+ elif src_type == "excel":
809
+ # Special-case: Abbreviations -> always show full clean definition (single block)
810
+ if "abbreviations.xlsx" in str(src).lower() or ("abbreviations" in str(src).lower() and src_type=="excel"):
811
+ excerpt = add_links_to_text(txt)
812
+ # Special-case: CDISC -> always show full clean definition (single block)
813
+ elif "cdisc glossary.xlsx" in str(src).lower() or ("cdisc" in str(src).lower() and src_type=="excel"):
814
+ excerpt = add_links_to_text(txt)
815
+ else:
816
+ # General Excel/MRCT parsing: parse labeled sections and build one combined excerpt
817
+ try:
818
+ sections = parse_excel_sections(txt)
819
+ except Exception:
820
+ sections = [("Glossary Definition", txt)]
821
+
822
+ lines = []
823
+ seen_vals = set()
824
+ for label, val in sections:
825
+ if not val or not str(val).strip():
826
+ continue
827
+ v = str(val).strip()
828
+ # Clickify URLs if the section is a single URL
829
+ if re.match(r"^https?://\S+$", v):
830
+ v_html = f'<a href="{v}" target="_blank" rel="noopener noreferrer">{v}</a>'
831
+ else:
832
+ v_html = add_links_to_text(v)
833
+ # Avoid duplicate repeated text segments
834
+ if v_html in seen_vals:
835
+ continue
836
+ seen_vals.add(v_html)
837
+ lines.append(f"<b>{label}:</b> {v_html}")
838
+
839
+ excerpt = "<br>".join(lines) if lines else add_links_to_text(txt)
840
+
841
+ else:
842
+ excerpt = add_links_to_text(txt)
843
+
844
+ # Prepare heading and display sources (exclude internal glossary.json from display)
845
+ heading_term = term.strip() or os.path.splitext(os.path.basename(src))[0]
846
+ heading_html = f"<h4>{icon} {heading_term}</h4>"
847
+
848
+ # Use _resolved_sources (preserved earlier) and ensure web URLs are shown directly
849
+ # Build clickable sources
850
+ resolved_sources = h.get("_resolved_sources") or []
851
+ display_sources = []
852
+
853
+ for s in resolved_sources:
854
+ if not isinstance(s, str):
855
+ continue
856
+ if s.lower().endswith("glossary.json"):
857
+ continue
858
+
859
+ if s.startswith("http"):
860
+ display_sources.append(
861
+ f'<a href="{s}" target="_blank" rel="noopener noreferrer">{s}</a>'
862
+ )
863
+ else:
864
+ display_sources.append(os.path.basename(s))
865
+
866
+ # Fallback if empty
867
+ if not display_sources:
868
+ if isinstance(src, str) and src.startswith("http"):
869
+ display_sources = [
870
+ f'<a href="{src}" target="_blank" rel="noopener noreferrer">{src}</a>'
871
+ ]
872
+ else:
873
+ display_sources = [os.path.basename(str(src))]
874
+
875
+ # ALWAYS create sources_line safely
876
+ sources_line = (
877
+ "<p>🔗 <i>Source:</i> "
878
+ + " · ".join(dict.fromkeys(display_sources))
879
+ + "</p>"
880
+ )
881
+
882
+ answers.append({
883
+ "rank": get_source_rank(src, src_type),
884
+ "type": cat,
885
+ "term": term,
886
+ "html": f"{heading_html}{sources_line}<blockquote>{excerpt}</blockquote>"
887
+ })
888
+
889
+ # ----------------------------
890
+ # Fallback: only use glossary.json definitions if no other original sources matched
891
+ # ----------------------------
892
+ if not answers and glossary_fallbacks:
893
+ for item in glossary_fallbacks:
894
+ h = item["hit"]
895
+ txt = item["text"]
896
+ src = item.get("src") or (h.get("file") or h.get("source") or "glossary.json")
897
+ term = (h.get("term") or "").strip() or "Definition"
898
+ heading_html = f"<h4>📄 {term}</h4>"
899
+ excerpt = txt.strip()
900
+ answers.append({
901
+ "rank": 10,
902
+ "type": "pdf",
903
+ "term": term,
904
+ "html": f"{heading_html}<p>🔗 <i>Source:</i> {os.path.basename(src)}</p><blockquote>{excerpt}</blockquote>"
905
+ })
906
+
907
+ # ----------------------------
908
+ # Final sort & output
909
+ # ----------------------------
910
+ if not answers:
911
+ return "<i>No relevant results found.</i>"
912
+
913
+ answers = sorted(answers, key=lambda a: a["rank"])
914
+ final_html_parts = [a["html"] for a in answers[:TOP_RESULTS_LIMIT]]
915
+
916
+ summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)
917
+
918
+ elapsed = time.time() - start
919
+ print(f"✅ Answers from {len(answers)} sources in {elapsed:.2f}s")
920
+
921
+ return (
922
+ f"<h3>🧠 Answers (one per source):</h3>"
923
+ f"<p><i>Sources → {summary_counts}</i></p>"
924
+ + "<br>".join(final_html_parts)
925
+ )
core/retrieval.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ from whoosh.index import open_dir
5
+ from whoosh.qparser import MultifieldParser
6
+
7
+ WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"
8
+
9
+ _ix = None
10
+
11
def _load_whoosh():
    """Open the persistent Whoosh index once and memoize it.

    Returns the cached index object, or None when the index directory
    has not been created yet.
    """
    global _ix
    if _ix is not None:
        return _ix
    if os.path.exists(WHOOSH_INDEX_PATH):
        _ix = open_dir(WHOOSH_INDEX_PATH)
    return _ix
16
+
17
def _bm25_search(query, top_n=10):
    """Keyword (BM25) search over the Whoosh index.

    Parses *query* against the ``text`` and ``title`` fields and returns
    up to *top_n* hits as ``{"text": ..., "file": ...}`` dicts.  Returns
    an empty list when no index is available.
    """
    ix = _load_whoosh()
    if not ix:
        return []
    parsed = MultifieldParser(["text", "title"], schema=ix.schema).parse(query)
    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=top_n)
        return [{"text": hit["text"], "file": hit.get("file", "")} for hit in hits]
core/van_normalizer.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/van_normalizer.py
2
+ import re
3
+ import nltk
4
+ from nltk import pos_tag, word_tokenize
5
+ from nltk.stem import WordNetLemmatizer
6
+
7
+ # make sure you have these (run once if missing):
8
+ # python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4
9
+
10
+ lemmatizer = WordNetLemmatizer()
11
+
12
def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for clinical trial domain):
    - Lowercases and removes punctuation
    - Tokenizes and POS-tags
    - Keeps only Nouns (N), Adjectives (J), and key Verbs (V)
    - Explicitly removes determiners/articles (a, an, the)
    - Lemmatizes each token to its base form
    - Returns a space-joined string suitable for FAISS embedding
    """
    if not text:
        return ""

    # Strip punctuation (keeping hyphens) and normalize case before tagging.
    cleaned = re.sub(r"[^a-z0-9\s-]", " ", text.lower().strip())
    tagged = pos_tag(word_tokenize(cleaned))

    # Determiners/articles and auxiliary "be" forms carry no retrieval signal.
    skip_words = {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}

    lemmas = []
    for word, tag in tagged:
        if word in skip_words:
            continue
        initial = tag[:1]
        # Penn Treebank tags: V* = verbs, J* = adjectives, N* = nouns.
        if initial not in ("V", "J", "N"):
            continue
        # Map the tag family to WordNet POS for accurate lemmatization.
        wn_pos = {"V": "v", "J": "a"}.get(initial, "n")
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))

    # Collapse any repeated whitespace and trim the final string.
    return re.sub(r"\s+", " ", " ".join(lemmas).strip())
57
+
core/vector_search.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ vector_search.py
3
+
4
+ Thin wrapper helpers used to orchestrate searches and resets from the app or admin UI.
5
+ """
6
+
7
+ from typing import List, Dict, Any
8
+ from core import vector_store, vector_sync
9
+
10
def semantic_search(query: str, top_k: int = 6) -> List[Dict[str, Any]]:
    """Run a FAISS semantic search, degrading gracefully to [] on any error."""
    try:
        hits = vector_store.search_index(query, top_k=top_k)
    except Exception as exc:
        print(f"⚠️ semantic_search error: {exc}")
        return []
    return hits
19
+
20
def reset_faiss_and_rebuild(glossary_builder_fn=None, rebuild_index_fn=None) -> str:
    """Clear local FAISS caches, then optionally rebuild glossary and index.

    Parameters
    ----------
    glossary_builder_fn : callable, optional
        Zero-argument function that rebuilds the glossary; its string
        result is appended to the status report.
    rebuild_index_fn : callable, optional
        Zero-argument function that triggers a full index rebuild.

    Returns a human-readable status string.  Failures of individual steps
    are reported in the output rather than raised.
    """
    try:
        vector_store.clear_local_faiss()
    except Exception as exc:
        print(f"⚠️ clear_local_faiss failed: {exc}")

    report = ["🧹 Cleared local FAISS files.\n"]

    if glossary_builder_fn:
        try:
            report.append(glossary_builder_fn() + "\n")
        except Exception as exc:
            report.append(f"⚠️ Glossary builder failed: {exc}\n")

    if rebuild_index_fn:
        try:
            report.append(rebuild_index_fn())
        except Exception as exc:
            report.append(f"⚠️ Rebuild index failed: {exc}\n")

    return "".join(report)
47
+
core/vector_store.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ vector_store.py
3
+ -----------------------------------------------------
4
+ Maintains FAISS runtime index + metadata cache.
5
+
6
+ Features
7
+ --------
8
+ - Ensure local FAISS runtime index exists (download from HF if missing)
9
+ - FAISS semantic search and BM25 text access
10
+ - Automatic TTL reload
11
+ - Full cache clearing for Hugging Face Space
12
+ - Explicit "♻️ FAISS memory cache reset" logging on rebuild
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import time
18
+ import shutil
19
+ from typing import List, Dict, Any, Optional
20
+
21
+ import numpy as np
22
+ import faiss
23
+ from sentence_transformers import SentenceTransformer
24
+ from huggingface_hub import hf_hub_download
25
+
26
+
27
+ # ------------------------------------------------------------------
28
+ # 🔧 Paths & constants
29
+ # ------------------------------------------------------------------
30
+ PERSISTENT_DIR = "/home/user/app/persistent"
31
+ RUNTIME_DIR = "/home/user/app/runtime_faiss"
32
+ INDEX_NAME = "faiss.index"
33
+ META_NAME = "faiss.index.meta.json"
34
+ GLOSSARY_META = "glossary.json"
35
+ HF_INDEX_REPO = "essprasad/CT-Chat-Index"
36
+
37
+ EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
38
+ EMBED_MODEL = None # lazy loaded
39
+
40
+ # in-memory cache
41
+ _runtime_index: Optional[faiss.Index] = None
42
+ _runtime_meta: Optional[List[Dict[str, Any]]] = None
43
+ _meta_loaded_time = 0.0
44
+ _META_TTL_SECONDS = 300.0
45
+
46
+
47
+ # ------------------------------------------------------------------
48
+ # 🔹 Helpers
49
+ # ------------------------------------------------------------------
50
def _ensure_dirs():
    """Create the persistent and runtime cache directories if absent."""
    for directory in (PERSISTENT_DIR, RUNTIME_DIR):
        os.makedirs(directory, exist_ok=True)
53
+
54
+
55
def _ensure_model():
    """Lazily load and cache the sentence-transformer embedding model."""
    global EMBED_MODEL
    if EMBED_MODEL is not None:
        return EMBED_MODEL
    print("📥 Loading embedding model for FAISS retrieval…")
    EMBED_MODEL = SentenceTransformer(EMBED_MODEL_NAME)
    print("✅ Embedding model loaded.")
    return EMBED_MODEL
62
+
63
+
64
+ # ------------------------------------------------------------------
65
+ # 🔹 Cache control
66
+ # ------------------------------------------------------------------
67
def clear_local_faiss():
    """Delete all local FAISS + glossary caches (safe in HF Space).

    Removes the persistent index, metadata and glossary files plus the
    whole runtime directory; each path is handled independently so one
    failure does not stop the rest.
    """
    targets = (
        os.path.join(PERSISTENT_DIR, INDEX_NAME),
        os.path.join(PERSISTENT_DIR, META_NAME),
        os.path.join(PERSISTENT_DIR, GLOSSARY_META),
        RUNTIME_DIR,
    )
    for path in targets:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            elif os.path.exists(path):
                os.remove(path)
            print(f"🗑️ Cleared: {path}")
        except Exception as exc:
            print(f"⚠️ Failed to clear {path}: {exc}")
    print("♻️ FAISS memory cache reset (runtime + persistent cleared)")
84
+
85
+
86
+ # ------------------------------------------------------------------
87
+ # 🔹 Loaders
88
+ # ------------------------------------------------------------------
89
def _load_local_index() -> bool:
    """Copy FAISS index + metadata from persistent storage into the runtime
    directory and load both into the in-memory cache.

    Returns True on success.  On any failure the in-memory cache is reset
    to None and False is returned.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()
    persistent_idx = os.path.join(PERSISTENT_DIR, INDEX_NAME)
    persistent_meta = os.path.join(PERSISTENT_DIR, META_NAME)
    try:
        if not (os.path.exists(persistent_idx) and os.path.exists(persistent_meta)):
            return False
        os.makedirs(RUNTIME_DIR, exist_ok=True)
        runtime_idx = os.path.join(RUNTIME_DIR, INDEX_NAME)
        runtime_meta = os.path.join(RUNTIME_DIR, META_NAME)
        # Work from a runtime copy so the persistent files stay untouched.
        shutil.copy2(persistent_idx, runtime_idx)
        shutil.copy2(persistent_meta, runtime_meta)
        _runtime_index = faiss.read_index(runtime_idx)
        with open(runtime_meta, "r", encoding="utf-8") as fh:
            _runtime_meta = json.load(fh)
        _meta_loaded_time = time.time()
        print(f"✅ Loaded FAISS index ({len(_runtime_meta)} vectors).")
        return True
    except Exception as exc:
        print(f"⚠️ Could not load local FAISS index: {exc}")
        _runtime_index = None
        _runtime_meta = None
        return False
112
+
113
+
114
def _download_index_from_hub() -> bool:
    """Fetch FAISS artifacts from the HF dataset repo into persistent storage,
    then load them through _load_local_index().

    Returns False (without raising) on any download or load failure.
    """
    _ensure_dirs()
    try:
        print("☁️ Downloading FAISS artifacts from HF dataset…")
        # Download both artifacts first, then copy them into place.
        cached_paths = {}
        for fname in (INDEX_NAME, META_NAME):
            cached_paths[fname] = hf_hub_download(
                repo_id=HF_INDEX_REPO,
                filename=f"persistent/{fname}",
                repo_type="dataset",
            )
        for fname, local_path in cached_paths.items():
            shutil.copy2(local_path, os.path.join(PERSISTENT_DIR, fname))
        print("✅ FAISS artifacts downloaded and stored persistently.")
        return _load_local_index()
    except Exception as exc:
        print(f"⚠️ HF download failed: {exc}")
        return False
132
+
133
+
134
def _ensure_faiss_index(force_refresh: bool = False) -> bool:
    """Ensure a runtime FAISS index is loaded in memory.

    With force_refresh=True the runtime copy is discarded first.  A cached
    index younger than the TTL is reused; otherwise the index is reloaded
    from persistent storage and, failing that, downloaded from the Hub.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()

    if force_refresh:
        try:
            shutil.rmtree(RUNTIME_DIR, ignore_errors=True)
            _runtime_index = None
            _runtime_meta = None
            print("♻️ Forced FAISS runtime reload requested.")
        except Exception as exc:
            print(f"⚠️ Force refresh failed: {exc}")

    cache_is_fresh = (
        _runtime_index is not None
        and (time.time() - _meta_loaded_time) < _META_TTL_SECONDS
    )
    if cache_is_fresh:
        return True

    # Local reload first; fall back to a Hub download.
    if _load_local_index() or _download_index_from_hub():
        return True

    print("⚠️ No FAISS index found locally or remotely.")
    return False
161
+
162
+
163
+ # ------------------------------------------------------------------
164
+ # 🔹 Accessors
165
+ # ------------------------------------------------------------------
166
def load_all_text_chunks() -> List[Dict[str, Any]]:
    """Return cached FAISS metadata (chunk dicts) for BM25 fallback/analysis.

    Loads the index on first use and, once the TTL elapses, refreshes the
    metadata from the runtime file (best-effort).  Returns [] when no index
    is available.
    """
    global _runtime_meta, _meta_loaded_time
    if _runtime_meta is None and not _ensure_faiss_index():
        return []
    if (time.time() - _meta_loaded_time) > _META_TTL_SECONDS:
        # Best-effort refresh; keep the stale copy if the file read fails.
        try:
            meta_file = os.path.join(RUNTIME_DIR, META_NAME)
            with open(meta_file, "r", encoding="utf-8") as fh:
                _runtime_meta = json.load(fh)
            _meta_loaded_time = time.time()
        except Exception:
            pass
    return _runtime_meta or []
181
+
182
+
183
+ # ------------------------------------------------------------------
184
+ # 🔹 Core Search
185
+ # ------------------------------------------------------------------
186
def search_index(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic FAISS search and return metadata hits.

    Each hit is a copy of the stored metadata dict augmented with "score",
    and with "file" and "text" backfilled from alternate keys.  Returns []
    when the index cannot be loaded or the search fails.
    """
    if not _ensure_faiss_index():
        return []

    try:
        model = _ensure_model()
        query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
        # Normalize so inner-product scores behave like cosine similarity.
        faiss.normalize_L2(query_vec)
        scores, positions = _runtime_index.search(query_vec, top_k)
        hits = []
        for score, pos in zip(scores[0], positions[0]):
            # FAISS pads missing results with -1; also guard the meta bound.
            if not (0 <= pos < len(_runtime_meta)):
                continue
            hit = dict(_runtime_meta[pos])
            hit["score"] = float(score)
            hit["file"] = hit.get("file") or hit.get("source") or "unknown"
            hit["text"] = hit.get("text") or hit.get("definition", "")
            hits.append(hit)
        return hits
    except Exception as exc:
        print(f"⚠️ FAISS search failed: {exc}")
        return []
core/vector_sync.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ vector_sync.py
3
+ Responsibilities:
4
+ - rebuild_faiss_from_glossary(glossary_path) -> builds a new faiss.Index + meta list
5
+ - _upload_to_dataset(index_path, meta_path, repo_id) -> upload via huggingface_hub
6
+ - safe helpers for creating normalized metadata entries
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import json
12
+ import shutil
13
+ from typing import Tuple, List, Dict, Any
14
+
15
+ import faiss
16
+ import numpy as np
17
+ from sentence_transformers import SentenceTransformer
18
+ from huggingface_hub import upload_file
19
+
20
+ # default embedder (same model used elsewhere)
21
+ EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
22
+
23
+ # directories
24
+ PERSISTENT_DIR = "/home/user/app/persistent"
25
+ TMP_DIR = "/home/user/app/tmp"
26
+ os.makedirs(PERSISTENT_DIR, exist_ok=True)
27
+ os.makedirs(TMP_DIR, exist_ok=True)
28
+
29
+
30
+ def _ensure_model():
31
+ """Return global sentence-transformer model."""
32
+ return SentenceTransformer(EMBED_MODEL_NAME)
33
+
34
+
35
+ def _normalize_meta_row(row: Dict[str, Any]) -> Dict[str, Any]:
36
+ """Ensure consistent meta record fields."""
37
+ out = {
38
+ "term": row.get("term") or row.get("Term") or row.get("name") or "",
39
+ "text": row.get("text") or row.get("definition") or row.get("content") or "",
40
+ # keep both 'file' (local/basename) and full 'sources' list
41
+ "file": row.get("file") or row.get("source") or "",
42
+ "type": row.get("type") or "",
43
+ "sources": row.get("sources") if isinstance(row.get("sources"), list) else [row.get("source")] if row.get("source") else []
44
+ }
45
+ return out
46
+
47
+
48
+ # ==========================================================
49
+ # 🧠 Main Function: Rebuild FAISS from glossary.json
50
+ # ==========================================================
51
def rebuild_faiss_from_glossary(glossary_path: str):
    """
    Build FAISS index + metadata from glossary JSON file.
    Handles mixed entries (PDF, Excel, Web, Other).
    Fully resilient against malformed or oversized rows.

    Parameters
    ----------
    glossary_path : str
        Path to a glossary JSON file.  Accepts either a list of entry
        dicts or a dict whose values are entry dicts.

    Returns
    -------
    tuple[faiss.Index, list[dict]]
        An inner-product FAISS index over L2-normalized embeddings and a
        metadata list aligned index-for-index with the vectors.

    Raises
    ------
    FileNotFoundError : if glossary_path does not exist.
    RuntimeError : if the JSON cannot be parsed, or no valid entries remain.
    ValueError : if the parsed JSON is neither a list nor a dict.
    """
    print(f"🧩 Building FAISS from glossary: {glossary_path}")
    if not os.path.exists(glossary_path):
        raise FileNotFoundError(f"Glossary not found: {glossary_path}")

    # --- Load JSON safely
    with open(glossary_path, "r", encoding="utf-8") as f:
        try:
            glossary_data = json.load(f)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load glossary JSON: {e}")

    # Normalize structure: dict values and lists are both treated as rows.
    if isinstance(glossary_data, dict):
        glossary_items = list(glossary_data.values())
    elif isinstance(glossary_data, list):
        glossary_items = glossary_data
    else:
        raise ValueError("Invalid glossary format — must be list or dict.")

    model = SentenceTransformer(EMBED_MODEL_NAME)
    # entries: encodable strings; metas: aligned metadata;
    # bad_entries / long_entries: diagnostics for skipped rows.
    entries, metas, bad_entries, long_entries = [], [], [], []

    # helper: normalized type inference from source string + declared type.
    # Checks pdf first, then excel, then web; anything else is "other".
    def infer_type_from_source(src: str, declared_type: str = "") -> str:
        src_l = (src or "").lower()
        declared = (declared_type or "").lower()
        if src_l.endswith(".pdf") or "pdf" in declared:
            return "pdf"
        if src_l.endswith((".xlsx", ".xls")) or "excel" in declared or "xls" in src_l:
            return "excel"
        if src_l.startswith("http") or declared == "web" or "http" in src_l:
            return "web"
        return "other"

    # --- Process glossary items (each row is isolated by its own try/except,
    # so one malformed row never aborts the rebuild)
    for i, item in enumerate(glossary_items):
        try:
            if not isinstance(item, dict):
                bad_entries.append(item)
                continue

            # Accept legacy key spellings for term and definition.
            term = str(item.get("term") or item.get("Term") or item.get("name") or "").strip()
            definition = str(item.get("definition") or item.get("text") or item.get("content") or "").strip()

            # Normalize sources (keep list); src is the joined display string.
            src_field = item.get("sources") or item.get("source") or item.get("file") or ""
            if isinstance(src_field, list):
                src_list = [str(s).strip() for s in src_field if s]
                src = ", ".join(src_list)
            else:
                src_list = [str(src_field).strip()] if src_field else []
                src = str(src_field).strip()

            declared_type = str(item.get("type") or "").strip().lower()
            entry_type = infer_type_from_source(src, declared_type)

            # Clean up noisy HTML tags and whitespace before length checks.
            definition_clean = re.sub(r"<[^>]*>", "", definition)
            definition_clean = re.sub(r"\s+", " ", definition_clean).strip()

            # Skip if missing essentials (no term or empty definition).
            if not term or not definition_clean:
                bad_entries.append(item)
                continue

            # Skip extremely long definitions (likely raw HTML or large web content)
            if len(definition_clean) > 3000:
                long_entries.append({
                    "term": term,
                    "len": len(definition_clean),
                    "source": src
                })
                continue

            # Embed a "Definition of <term>: ..." sentence, not the raw text.
            text = f"Definition of {term}: {definition_clean}"

            entries.append(text)
            metas.append({
                "term": term,
                "definition": definition_clean,
                # preserve the original source list and file name
                "sources": src_list if src_list else [src] if src else [],
                "source": src,
                "type": entry_type,
                "file": os.path.basename(glossary_path)
            })

        except Exception as e:
            # Record the failure with a truncated raw snapshot for debugging.
            bad_entries.append({
                "index": i,
                "error": str(e),
                "raw": str(item)[:300]
            })
            continue

    # --- Diagnostics: per-type counts of the rows that survived filtering.
    pdf_count = sum(1 for m in metas if m["type"].lower() == "pdf")
    excel_count = sum(1 for m in metas if m["type"].lower() == "excel")
    web_count = sum(1 for m in metas if m["type"].lower() == "web")
    other_count = len(metas) - (pdf_count + excel_count + web_count)

    print(f"🧠 Encoding {len(entries)} entries (PDF={pdf_count}, Excel={excel_count}, Web={web_count}, Other={other_count})…")

    if bad_entries:
        print(f"⚠️ {len(bad_entries)} malformed entries skipped.")
        for b in bad_entries[:3]:
            print(" →", json.dumps(b, ensure_ascii=False)[:300])

    if long_entries:
        print(f"⚠️ {len(long_entries)} very long entries (>3000 chars) skipped.")
        for l in long_entries[:3]:
            print(f" → Skipped {l['term']} ({l['len']} chars) from {l['source']}")

    if not entries:
        raise RuntimeError("❌ No valid glossary entries found after cleanup!")

    # --- Encoding: L2-normalize so IndexFlatIP scores act as cosine similarity.
    embeddings = model.encode(entries, show_progress_bar=True, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    print(f"✅ Glossary vectors built ({len(entries)} total entries).")

    # metas is list of dicts aligned with vectors — return exactly as before
    return index, metas
+ return index, metas
183
+
184
+ # ==========================================================
185
+ # ☁️ Upload Helper
186
+ # ==========================================================
187
def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str) -> None:
    """
    Upload FAISS index and metadata JSON to Hugging Face dataset.

    Both files are placed under the repo's ``persistent/`` folder, keyed by
    their basenames.  Failures are logged and re-raised to the caller.
    """
    try:
        print(f"☁️ Uploading {index_path} and {meta_path} to {repo_id}...")
        for local_path in (index_path, meta_path):
            upload_file(
                path_or_fileobj=local_path,
                path_in_repo=f"persistent/{os.path.basename(local_path)}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        print("✅ Upload complete.")
    except Exception as exc:
        print(f"⚠️ Upload failed: {exc}")
        raise
core/web_loader.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests, re, json, time, os
2
+ from bs4 import BeautifulSoup
3
+
4
def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs.

    Reads URLs (one per line, '#' starts a comment line) from *urls_file*,
    downloads and strips each page to visible text, and persists results in
    a JSON cache at *cache_path*.  With force_refresh=True all URLs are
    re-fetched.  Returns the cached entry dicts as a list.
    """
    cache = {}
    if not force_refresh and os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as fh:
                cache = json.load(fh)
        except Exception:
            cache = {}

    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as fh:
        urls = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

    fetched = {}
    for i, url in enumerate(urls[: max_pages * 10]):
        # Reuse the cached copy unless a refresh was requested.
        if url in cache and not force_refresh:
            fetched[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
            resp = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0"},
            )
            if resp.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
            # Drop boilerplate elements before extracting visible text.
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            page_text = " ".join(soup.get_text().split())
            # Very short pages are almost certainly error/stub pages.
            if len(page_text) < 400:
                continue
            fetched[url] = {
                "source": url,
                "type": "Website",
                "text": f"Source URL: {url}. {page_text[:3000]}",
            }
            # Be polite to the remote servers.
            time.sleep(1)
        except Exception as exc:
            print(f"⚠️ Error fetching {url}: {exc}")

    cache.update(fetched)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as fh:
        json.dump(cache, fh, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())