Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- core/admin_tasks.py +265 -0
- core/bm25.py +137 -0
- core/glossary.py +109 -0
- core/glossary_builder.py +480 -0
- core/hybrid_retriever.py +925 -0
- core/retrieval.py +25 -0
- core/van_normalizer.py +57 -0
- core/vector_search.py +47 -0
- core/vector_store.py +208 -0
- core/vector_sync.py +208 -0
- core/web_loader.py +55 -0
core/admin_tasks.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
core/admin_tasks.py
|
| 3 |
+
|
| 4 |
+
Centralized admin / maintenance functions used by both the Gradio UI (app.py)
|
| 5 |
+
and the FastAPI admin endpoints (api.py). These are synchronous as in your
|
| 6 |
+
current setup and return friendly status strings for display.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import shutil
|
| 12 |
+
import glob
|
| 13 |
+
import traceback
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import pandas as pd
|
| 17 |
+
except Exception:
|
| 18 |
+
pd = None
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
import faiss
|
| 22 |
+
except Exception:
|
| 23 |
+
faiss = None
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
from sentence_transformers import SentenceTransformer
|
| 27 |
+
except Exception:
|
| 28 |
+
SentenceTransformer = None
|
| 29 |
+
|
| 30 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
| 31 |
+
|
| 32 |
+
# functions from your project (should exist)
|
| 33 |
+
# rebuild_faiss_from_glossary should return (index, metas) like before.
|
| 34 |
+
try:
|
| 35 |
+
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
|
| 36 |
+
except Exception:
|
| 37 |
+
rebuild_faiss_from_glossary = None
|
| 38 |
+
_upload_to_dataset = None
|
| 39 |
+
|
| 40 |
+
# Optional web loader
|
| 41 |
+
try:
|
| 42 |
+
from core.web_loader import web_crawler_loader
|
| 43 |
+
except Exception:
|
| 44 |
+
web_crawler_loader = None
|
| 45 |
+
|
| 46 |
+
PERSISTENT_DIR = "/home/user/app/persistent"
|
| 47 |
+
DATASET_INDEX_REPO = os.environ.get("DATASET_INDEX_REPO", "essprasad/CT-Chat-Index")
|
| 48 |
+
DATASET_DOCS_REPO = os.environ.get("DATASET_DOCS_REPO", "essprasad/CT-Chat-Docs")
|
| 49 |
+
|
| 50 |
+
def _ensure_dirs():
    """Create the persistent storage directory if it does not already exist."""
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
| 53 |
+
def clear_index():
    """Delete local FAISS and related caches. Returns a message string."""
    targets = [
        os.path.join(PERSISTENT_DIR, "faiss.index"),
        os.path.join(PERSISTENT_DIR, "faiss.index.meta.json"),
        os.path.join(PERSISTENT_DIR, "glossary.json"),
        "/home/user/app/data/docs_cache",
        "/home/user/app/runtime_faiss",
    ]
    messages = []
    for path in targets:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
                messages.append(f"🗑️ Deleted folder: {path}")
            elif os.path.exists(path):
                os.remove(path)
                messages.append(f"🗑️ Deleted file: {path}")
        except Exception as e:
            messages.append(f"⚠️ Failed to delete {path}: {e}")
    # Nothing matched on disk at all.
    return "\n".join(messages) if messages else "ℹ️ No cache files found."
| 77 |
+
|
| 78 |
+
def rebuild_glossary():
    """
    Calls the existing glossary builder (core.glossary_builder.rebuild_and_upload).
    Returns status string.
    """
    try:
        from core.glossary_builder import rebuild_and_upload
    except Exception as e:
        return f"⚠️ Cannot import glossary builder: {e}"
    try:
        rebuild_and_upload()
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}\n{traceback.format_exc()}"
    return "✅ Glossary rebuilt and uploaded successfully."
| 94 |
+
|
| 95 |
+
def _collect_excel_entries(excel_files):
    """Download each Excel file from the docs dataset and extract glossary-style rows (best-effort)."""
    entries = []
    for file_name in excel_files:
        try:
            fp = hf_hub_download(repo_id=DATASET_DOCS_REPO, filename=file_name, repo_type="dataset")
            # Read all sheets and look for MRCT-style columns (best-effort).
            try:
                xls = pd.read_excel(fp, sheet_name=None)
            except Exception:
                xls = {}
            for sheet, df in xls.items():
                if not isinstance(df, pd.DataFrame):
                    continue
                cols = [c.lower() for c in df.columns.astype(str)]
                # Heuristic — only index sheets that carry a glossary/term header.
                if not any("glossary term" in c or "term" == c.strip().lower() for c in cols):
                    continue
                df = df.fillna("").dropna(how="all")
                for _, row in df.iterrows():
                    term = str(row.get("Glossary Term", "") or row.get("term", "")).strip()
                    if not term:
                        # Fall back to the first column.
                        try:
                            term = str(row.iloc[0]).strip()
                        except Exception:
                            term = ""
                    if not term:
                        continue
                    combined = " ".join(str(x) for x in row.values if str(x).strip())
                    entries.append({
                        "file": file_name,
                        "sheet": sheet,
                        "term": term,
                        "type": "excel",
                        "text": combined,
                        "source": file_name,
                    })
        except Exception:
            # Non-fatal: skip a problematic Excel file entirely.
            continue
    return entries


def _save_and_upload_index(index, metas):
    """Persist index + metadata to PERSISTENT_DIR, then best-effort upload to the HF dataset."""
    try:
        faiss_path = os.path.join(PERSISTENT_DIR, "faiss.index")
        meta_path = os.path.join(PERSISTENT_DIR, "faiss.index.meta.json")
        if faiss is not None and hasattr(faiss, "write_index"):
            faiss.write_index(index, faiss_path)
            with open(meta_path, "w", encoding="utf-8") as f:
                json.dump(metas, f, indent=2)
            # Upload only when the project helper is importable.
            if _upload_to_dataset is not None:
                try:
                    _upload_to_dataset(faiss_path, meta_path, DATASET_INDEX_REPO)
                except Exception:
                    pass
    except Exception:
        pass


def rebuild_index(force_download_glossary: bool = False):
    """
    Rebuild FAISS index from glossary.json + Excel + (optionally) web content.

    Parameters
    ----------
    force_download_glossary : bool
        When True, re-download glossary.json from the HF dataset even if a
        local copy already exists.

    Returns
    -------
    str
        Friendly status string for display (success count or failure details).
    """
    _ensure_dirs()
    try:
        if rebuild_faiss_from_glossary is None:
            return "⚠️ rebuild_faiss_from_glossary is not available in core.vector_sync."

        glossary_path = os.path.join(PERSISTENT_DIR, "glossary.json")
        # Download glossary.json from the HF dataset when missing or forced.
        if not os.path.exists(glossary_path) or force_download_glossary:
            try:
                downloaded = hf_hub_download(
                    repo_id=DATASET_INDEX_REPO,
                    filename="persistent/glossary.json",
                    repo_type="dataset",
                )
                shutil.copy2(downloaded, glossary_path)
            except Exception as e:
                # FIX: only abort when there is no usable local copy; a failed
                # *forced* refresh falls back to the file already on disk
                # (the original aborted even with a valid local glossary).
                if not os.path.exists(glossary_path):
                    return f"⚠️ Could not download glossary.json from {DATASET_INDEX_REPO}: {e}"

        # Build the base FAISS index using the project helper.
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        loaded = len(metas) if isinstance(metas, (list, tuple)) else 0

        # Discover Excel files in the docs dataset (best-effort).
        try:
            repo_files = list_repo_files(DATASET_DOCS_REPO, repo_type="dataset")
            excel_files = [f for f in repo_files if f.lower().endswith((".xls", ".xlsx"))]
        except Exception:
            excel_files = []

        model = None
        # Embed and add Excel content when every optional dependency is present.
        # FIX: the original guard omitted `pd is not None` although pandas is
        # required inside the Excel path.
        if SentenceTransformer is not None and faiss is not None and pd is not None and excel_files:
            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            excel_entries = _collect_excel_entries(excel_files)
            if excel_entries:
                texts = [e["text"] for e in excel_entries]
                embs = model.encode(texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                try:
                    faiss.normalize_L2(embs)
                    index.add(embs)
                    if isinstance(metas, list):
                        metas.extend(excel_entries)
                        loaded = len(metas)
                except Exception:
                    # Index may be incompatible or None.
                    pass

        # Optionally fetch & embed web content if web_crawler_loader exists.
        if web_crawler_loader is not None and SentenceTransformer is not None and faiss is not None:
            try:
                web_entries = web_crawler_loader(
                    urls_file="/home/user/app/data/urls.txt",
                    cache_path=os.path.join(PERSISTENT_DIR, "web_cache.json"),
                    max_pages=2,
                    timeout=15,
                    force_refresh=False,
                )
                # FIX: keep embeddings and metadata aligned — the original
                # embedded only entries with text > 50 chars but extended
                # `metas` with ALL web entries, desynchronizing index rows
                # from their metadata.
                kept = [w for w in (web_entries or []) if len(w.get("text", "")) > 50]
                if kept:
                    if model is None:
                        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
                    web_texts = [w.get("text", "") for w in kept]
                    web_emb = model.encode(web_texts, show_progress_bar=False, convert_to_numpy=True).astype("float32")
                    faiss.normalize_L2(web_emb)
                    index.add(web_emb)
                    if isinstance(metas, list):
                        metas.extend(kept)
                        loaded = len(metas)
            except Exception:
                pass

        # Save the index + meta back to persistent storage (and upload).
        _save_and_upload_index(index, metas)

        return f"✅ Rebuild complete: {loaded} entries."
    except Exception as e:
        tb = traceback.format_exc()
        return f"⚠️ Rebuild failed: {e}\n{tb}"
| 227 |
+
|
| 228 |
+
def reset_faiss_cache():
    """
    Wipe persistent & runtime FAISS/glossary, then call rebuild_glossary + rebuild_index.
    Returns concatenated status string.
    """
    report = []
    # Wipe persistent caches first.
    try:
        stale = [
            os.path.join(PERSISTENT_DIR, name)
            for name in ("faiss.index", "faiss.index.meta.json", "glossary.json", "web_cache.json")
        ]
        stale.append("/home/user/app/runtime_faiss")
        for path in stale:
            try:
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                elif os.path.exists(path):
                    os.remove(path)
            except Exception:
                pass
        report.append("🧹 Persistent FAISS + glossary caches cleared.")
    except Exception as e:
        report.append(f"⚠️ Failed clearing caches: {e}")

    # Rebuild the glossary, then the index, collecting each status line.
    try:
        report.append(rebuild_glossary())
    except Exception as e:
        report.append(f"⚠️ Rebuild glossary failed: {e}")
    try:
        report.append(rebuild_index())
    except Exception as e:
        report.append(f"⚠️ Rebuild index failed: {e}")

    return "\n".join(report)
|
core/bm25.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import math
|
| 5 |
+
from collections import defaultdict, Counter
|
| 6 |
+
|
| 7 |
+
# --- 🔧 NEW: Lightweight stemming and lemmatization helpers
|
| 8 |
+
try:
|
| 9 |
+
import nltk
|
| 10 |
+
from nltk.stem import WordNetLemmatizer, PorterStemmer
|
| 11 |
+
from nltk.corpus import wordnet
|
| 12 |
+
nltk.download("wordnet", quiet=True)
|
| 13 |
+
nltk.download("omw-1.4", quiet=True)
|
| 14 |
+
except Exception:
|
| 15 |
+
WordNetLemmatizer = PorterStemmer = None
|
| 16 |
+
|
| 17 |
+
# Initialize stemmer and lemmatizer
|
| 18 |
+
_lemmatizer = WordNetLemmatizer() if WordNetLemmatizer else None
|
| 19 |
+
_stemmer = PorterStemmer() if PorterStemmer else None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _normalize_token(token: str) -> str:
    """Normalize a token by lowercasing, lemmatizing, and stemming."""
    result = token.lower().strip()
    # Each step is optional (NLTK may be unavailable) and failure-tolerant.
    for transform in (
        _lemmatizer.lemmatize if _lemmatizer else None,
        _stemmer.stem if _stemmer else None,
    ):
        if transform is None:
            continue
        try:
            result = transform(result)
        except Exception:
            pass
    return result
| 37 |
+
|
| 38 |
+
class BM25:
    """Minimal Okapi BM25 scorer over a list of document dicts."""

    def __init__(self, corpus):
        # Each corpus item is a dict; its text may live under 'text',
        # 'definition', or 'content' (see _get_text).
        self.corpus = corpus
        self.tokenized_corpus = [self._tokenize(self._get_text(d)) for d in corpus]
        self.doc_lens = [len(toks) for toks in self.tokenized_corpus]
        self.avgdl = sum(self.doc_lens) / len(self.doc_lens) if self.doc_lens else 0.0
        self.doc_freqs = self._calc_doc_freqs()
        # Standard BM25 hyperparameters.
        self.k1 = 1.5
        self.b = 0.75

    def _get_text(self, doc):
        """Safely extract text from multiple possible keys ('text', 'definition', 'content')."""
        if isinstance(doc, dict):
            for key in ("text", "definition", "content"):
                value = doc.get(key)
                if isinstance(value, str):
                    return value
        return ""

    def _tokenize(self, text):
        """Tokenize and normalize each word with stemming and lemmatization."""
        return [_normalize_token(tok) for tok in re.findall(r"\w+", (text or "").lower()) if tok]

    def _calc_doc_freqs(self):
        """Document frequency per term: number of documents containing it."""
        df = defaultdict(int)
        for tokens in self.tokenized_corpus:
            for term in set(tokens):
                df[term] += 1
        return df

    def _idf(self, term):
        """Smoothed inverse document frequency (avoids division issues)."""
        n_docs = len(self.tokenized_corpus)
        if n_docs <= 0:
            return 0.0
        df = self.doc_freqs.get(term, 0)
        return math.log(1 + (n_docs - df + 0.5) / (df + 0.5))

    def get_scores(self, query_tokens):
        """Return the BM25 score of every corpus document for the query tokens."""
        scores = []
        for tokens, dl in zip(self.tokenized_corpus, self.doc_lens):
            tf_map = Counter(tokens)
            total = 0.0
            for term in query_tokens:
                tf = tf_map.get(term, 0)
                # `or 1.0` guards against a zero average document length.
                denom = tf + self.k1 * (1 - self.b + self.b * dl / (self.avgdl or 1.0))
                if denom != 0:
                    total += self._idf(term) * ((tf * (self.k1 + 1)) / denom)
            scores.append(total)
        return scores
| 95 |
+
|
| 96 |
+
def search_bm25(query, docs=None, top_n=10):
    """
    BM25 search helper.
    - query: string
    - docs: optional list of dicts (each may have 'text'/'definition'/'content');
      if None, will load from vector_store.load_all_text_chunks()
    - top_n: int
    Returns list of doc dicts with added 'score' field.
    """
    from core.vector_store import load_all_text_chunks

    if docs is None:
        docs = load_all_text_chunks() or []
    if not docs:
        return []

    ranker = BM25(docs)

    # Normalize query tokens with the same stem/lemma logic as the corpus.
    tokens = [_normalize_token(t) for t in re.findall(r"\w+", (query or "").lower()) if t]
    if not tokens:
        return []

    scores = ranker.get_scores(tokens)

    # Boost Excel glossary sources (MRCT, xlsx/xls) by +15%.
    boost_markers = (".xlsx", ".xls", "mrct", "clinical-research-glossary")
    for idx, doc in enumerate(docs):
        origin = (doc.get("file") or doc.get("source") or "").lower()
        if any(marker in origin for marker in boost_markers):
            scores[idx] *= 1.15

    # Rank and return the top_n documents.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_n]
    results = []
    for idx in ranked:
        hit = dict(docs[idx])  # shallow copy
        # Ensure a 'text' key exists so the retriever can render it.
        if "text" not in hit:
            hit["text"] = ranker._get_text(hit)
        hit["score"] = float(scores[idx])
        results.append(hit)
    return results
|
core/glossary.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# core/glossary.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from difflib import get_close_matches
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
| 9 |
+
GLOSSARY = None
|
| 10 |
+
GLOSSARY_TERMS_CACHE = [] # 🧠 Cache of glossary keys for fuzzy matching
|
| 11 |
+
DATASET_REPO = "essprasad/CT-Chat-Index"
|
| 12 |
+
GLOSSARY_FILENAME = "persistent/glossary.json"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback."""
    if not term:
        return ""
    cleaned = re.sub(r'\s+', ' ', re.sub(r'[\-_/\\.,;:]+', ' ', term.lower().strip()))

    # Common clinical research synonym normalization. Order matters: longer
    # phrases collapse before their substrings ("serious adverse event"
    # before "adverse event").
    replacements = (
        ("e crf", "ecrf"), ("e-crf", "ecrf"), ("e/crf", "ecrf"), ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip()

    # Fuzzy matching fallback (for plural/singular forms or typos).
    if GLOSSARY_TERMS_CACHE and cleaned not in GLOSSARY_TERMS_CACHE:
        close = get_close_matches(cleaned, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if close:
            # Return the closest existing key for better recall.
            return close[0]

    return cleaned
| 46 |
+
|
| 47 |
+
def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached in module globals)."""
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        local_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(local_path, "r", encoding="utf-8") as fh:
            raw = json.load(fh)

        GLOSSARY = {}
        for key, entry in raw.items():
            # Skip non-string keys, over-long phrases, and year-like entries.
            if not isinstance(key, str) or len(key.split()) > 12 or re.search(r'\d{4}', key):
                continue

            display = key
            if isinstance(entry, dict):
                display = entry.get("term") or entry.get("name") or entry.get("title") or key

            norm = _normalize_term(display)
            if not norm:
                continue

            if isinstance(entry, dict):
                definition = entry.get("definition") or entry.get("text") or ""
                sources = entry.get("sources", [])
            elif isinstance(entry, str):
                definition, sources = entry, []
            else:
                definition, sources = "", []

            # Drop empty / trivially short definitions.
            if not definition or len(definition.strip()) < 5:
                continue

            if norm in GLOSSARY:
                # Merge sources into the already-stored record.
                known = set(GLOSSARY[norm].get("sources", []))
                extra = set(sources) if sources else set()
                GLOSSARY[norm]["sources"] = list(known.union(extra))
            else:
                GLOSSARY[norm] = {
                    "term": display.strip(),
                    "definition": definition.strip(),
                    "sources": sources if isinstance(sources, list) else [],
                }

        # Keep all glossary keys around for the fuzzy fallback matcher.
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())

        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
| 108 |
+
|
| 109 |
+
__all__ = ["_load_glossary", "_normalize_term"]
|
core/glossary_builder.py
ADDED
|
@@ -0,0 +1,480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
📘 glossary_builder.py — FINAL VERSION WITH MRCT SECTION FIX + CDISC + ABBREVIATIONS
|
| 3 |
+
------------------------------------------------------------------------------------
|
| 4 |
+
Builds a unified glossary from:
|
| 5 |
+
- PDF glossary files
|
| 6 |
+
- MRCT Clinical Research Glossary (Excel)
|
| 7 |
+
- CDISC Glossary (Excel)
|
| 8 |
+
- Abbreviations (Excel)
|
| 9 |
+
- Web glossary sources
|
| 10 |
+
|
| 11 |
+
Features:
|
| 12 |
+
✔ Correctly splits MRCT concatenated cells (Glossary Definition + Use in Context…)
|
| 13 |
+
✔ Removes all duplicated sections
|
| 14 |
+
✔ Maintains the correct order of sections
|
| 15 |
+
✔ Handles CDISC Submission Value → Definition extraction
|
| 16 |
+
✔ Handles Abbreviations.xlsx column patterns
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import re
|
| 21 |
+
import json
|
| 22 |
+
import time
|
| 23 |
+
import fitz
|
| 24 |
+
import requests
|
| 25 |
+
import pandas as pd
|
| 26 |
+
from bs4 import BeautifulSoup
|
| 27 |
+
from huggingface_hub import (
|
| 28 |
+
upload_file, HfFolder, list_repo_files, hf_hub_download
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ------------------------------------------------------------------------------
|
| 33 |
+
# CONFIG
|
| 34 |
+
# ------------------------------------------------------------------------------
|
| 35 |
+
DATASET_REPO = "essprasad/CT-Chat-Index"
|
| 36 |
+
DOCS_REPO = "essprasad/CT-Chat-Docs"
|
| 37 |
+
|
| 38 |
+
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
|
| 39 |
+
REMOTE_GLOSSARY = "persistent/glossary.json"
|
| 40 |
+
|
| 41 |
+
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
|
| 42 |
+
|
| 43 |
+
WEB_SOURCES = [
|
| 44 |
+
"https://mrctcenter.org/glossaryterm/clinical-research/",
|
| 45 |
+
"https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
|
| 46 |
+
"https://www.cdisc.org/",
|
| 47 |
+
"https://www.ich.org/",
|
| 48 |
+
"https://www.ema.europa.eu/",
|
| 49 |
+
"https://www.who.int/",
|
| 50 |
+
"https://clinicaltrials.gov/",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ------------------------------------------------------------------------------
|
| 55 |
+
# HELPERS
|
| 56 |
+
# ------------------------------------------------------------------------------
|
| 57 |
+
def normalize_term(term: str) -> str:
    """Lowercase, strip punctuation, collapse whitespace, then map known synonyms."""
    if not term:
        return ""
    cleaned = re.sub(r"\s+", " ", re.sub(r"[\-_/\\.,;:]+", " ", term.lower().strip()))
    # Exact-phrase synonym table: only whole-string matches are replaced.
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "good clinical practice": "gcp",
        "clinical study report": "csr",
        "informed consent form": "icf",
        "adverse event": "ae",
        "serious adverse event": "sae",
        "21 cfr part 11": "21cfrpart11",
    }
    return synonyms.get(cleaned, cleaned)
| 74 |
+
|
| 75 |
+
def extract_text_from_pdf(path):
    """Return the full plain text of a PDF (pages joined by newlines), or "" on error.

    FIX: the original only closed the document after a successful extraction,
    leaking the file handle when page.get_text raised; close now happens in a
    finally block.
    """
    try:
        document = fitz.open(path)
        try:
            return "\n".join(page.get_text("text") for page in document)
        finally:
            document.close()
    except Exception as e:
        print(f"⚠️ Error reading PDF {path}: {e}")
        return ""
| 85 |
+
|
| 86 |
+
def extract_definitions_from_text(text):
    """
    Heuristically split flat (PDF-extracted) text into term/definition pairs.

    A short alphanumeric line that does not end in '.' is treated as a term
    heading; the lines until the next heading form its definition. Definitions
    shorter than five words are discarded.
    """
    glossary = {}
    lines = [ln.strip() for ln in re.sub(r"\r", "", text).split("\n") if ln.strip()]

    pos = 0
    total = len(lines)
    while pos < total:
        heading = lines[pos]

        # Skip obvious non-terms: single characters and bare page numbers.
        if len(heading) <= 1 or heading.isdigit():
            pos += 1
            continue
        # Skip table-of-contents style headings.
        if any(marker in heading.lower() for marker in ("contents", "chapter", "appendix", "index")):
            pos += 1
            continue

        body = []
        cursor = pos + 1
        while cursor < total:
            candidate = lines[cursor]
            # A short letter-initial line without a trailing period starts the next term.
            if re.match(r"^[A-Za-z][A-Za-z0-9\- ]{0,20}$", candidate) and not candidate.endswith("."):
                break
            body.append(candidate)
            cursor += 1

        definition = " ".join(body).strip()
        if len(definition.split()) < 5:
            pos += 1
            continue

        glossary[normalize_term(heading)] = {
            "term": heading,
            "definition": definition,
        }
        pos = cursor

    return glossary
| 131 |
+
|
| 132 |
+
def detect_source_type(src: str) -> str:
    """Classify a source string as 'pdf', 'excel', 'web', or 'other'.

    File-extension checks run before the URL check, so a URL ending in .pdf
    is classified as 'pdf'.
    """
    lowered = src.lower()
    if lowered.endswith(".pdf"):
        return "pdf"
    if lowered.endswith((".xls", ".xlsx")):
        return "excel"
    return "web" if lowered.startswith("http") else "other"
| 142 |
+
|
| 143 |
+
def extract_web_glossary(url):
    """Scrape naive term/definition pairs from a web page.

    Fetches *url*, strips markup, and extracts up to 50 "Term: definition"
    style matches.  Returns a list of glossary entry dicts; on any error
    the list is empty (errors are printed, never raised).
    """
    entries = []
    try:
        print(f"🌐 Fetching {url}…")
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            print(f"⚠️ Skipped {url} (HTTP {resp.status_code})")
            return []

        page_text = BeautifulSoup(resp.text, "html.parser").get_text(separator="\n")

        pairs = re.findall(
            r"([A-Z][A-Za-z0-9 \-]{3,30})[:\-]\s*(.{10,200})", page_text
        )

        # Cap at 50 entries so noisy pages don't flood the glossary.
        for raw_term, raw_def in pairs[:50]:
            entries.append(
                {
                    "term": raw_term.strip(),
                    "definition": raw_def.strip(),
                    "sources": [url],
                    "file": url,
                    "type": "web",
                }
            )

    except Exception as e:
        print(f"⚠️ Web extraction error for {url}: {e}")

    return entries
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ------------------------------------------------------------------------------
# MRCT STRUCTURED CELL PARSER
# ------------------------------------------------------------------------------
# Section headings used by the MRCT glossary Excel export, in display order.
SECTION_LABELS = [
    "Glossary Definition",
    "Use in Context",
    "More Info",
    "Other Info to Think About When Joining a Study",
    "Related Terms",
    "Resource URL",
]

# Case-insensitive matcher for any section label (including the trailing
# colon).  Used with re.split, whose capture group keeps the label tokens
# in the resulting list so parse_mrct_cell can pair label -> value.
LABEL_RE = re.compile(
    r"(?i)(Glossary Definition:|Use in Context:|More Info:|Other Info to Think About When Joining a Study:|Related Terms:|Resource URL:)"
)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def parse_mrct_cell(cell: str):
    """Split one MRCT spreadsheet cell into (label, text) sections.

    re.split with LABEL_RE alternates label tokens (ending in ':') with
    their values; unlabeled leading text becomes the 'Glossary Definition'.
    Returns [] for blank or non-string cells.
    """
    if not isinstance(cell, str) or not cell.strip():
        return []

    collapsed = re.sub(r"\s{2,}", " ", cell.strip())

    # Split by labels (capture group keeps the label tokens in the list).
    fragments = re.split(LABEL_RE, collapsed)

    sections = []
    idx = 0
    total = len(fragments)
    while idx < total:
        fragment = fragments[idx].strip()
        if not fragment:
            idx += 1
        elif fragment.endswith(":"):
            # Label token: pair it with the fragment that follows it.
            value = fragments[idx + 1].strip() if idx + 1 < total else ""
            sections.append((fragment[:-1].strip(), value))
            idx += 2
        else:
            sections.append(("Glossary Definition", fragment))
            idx += 1
    return sections
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# ------------------------------------------------------------------------------
|
| 220 |
+
# MAIN: GLOSSARY REBUILD
|
| 221 |
+
# ------------------------------------------------------------------------------
|
| 222 |
+
def rebuild_and_upload():
    """Rebuild glossary.json from PDFs, Excel files, and web sources, then upload.

    Pipeline: list files in the DOCS_REPO dataset → extract definitions per
    source type → merge into one dict keyed by "<normalized term>__<file>" →
    write LOCAL_GLOSSARY → upload to DATASET_REPO when TOKEN is set.
    Per-source failures are printed and skipped; only the initial listing
    failure aborts the whole rebuild (RuntimeError).
    """
    start = time.time()
    print("📘 Starting glossary rebuild…")

    try:
        all_files = list_repo_files(
            repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN
        )
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"❌ Cannot list files: {e}")

    # Keyed by "<normalized term>__<source file>" so the same term coming
    # from different sources is preserved as separate entries.
    all_defs = {}

    # ----------------------------------------------------
    # 1️⃣ PDFs
    # ----------------------------------------------------
    # Filenames containing these substrings are not glossaries — skip them.
    skip_patterns = [
        "topic_", "template", "schedule", "protocol",
        "painac", "sas", "glossary_printable"
    ]

    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary PDF: {pdf}")
            continue

        print(f"🔍 Processing PDF: {pdf}")
        try:
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=pdf,
                token=TOKEN,
                repo_type="dataset"
            )
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)

            # Annotate each extracted definition with its provenance.
            for k, v in defs.items():
                v["sources"] = [pdf]
                v["file"] = pdf
                v["type"] = "pdf"
                all_defs[f"{k}__{pdf}"] = v

        except Exception as e:
            print(f"⚠️ PDF extraction error: {pdf}: {e}")

    # ----------------------------------------------------
    # 2️⃣ Excel files (MRCT + Abbreviations + CDISC)
    # ----------------------------------------------------
    for excel in excels:
        try:
            print(f"📗 Processing Excel: {excel}")
            path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                token=TOKEN,
                repo_type="dataset"
            )

            # sheet_name=None → dict of {sheet name: DataFrame} for all sheets.
            xls = pd.read_excel(path, sheet_name=None)

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.empty:
                    continue

                df.columns = [str(c).strip() for c in df.columns]
                lower_cols = {c.lower(): c for c in df.columns}

                # -----------------------------
                # Detect term column
                # -----------------------------
                # MRCT-style: "Glossary Term" or plain "Term".
                term_col = next(
                    (
                        c
                        for c in df.columns
                        if "glossary term" in c.lower() or c.lower() == "term"
                    ),
                    None,
                )

                # Abbreviations
                if not term_col:
                    for c in [
                        "acronym",
                        "abbreviation",
                        "acryonym/abbreviation/initial",
                        "initial",
                    ]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # CDISC
                if not term_col:
                    for c in ["cdisc submission value", "submission value"]:
                        if c in lower_cols:
                            term_col = lower_cols[c]
                            break

                # Fallback
                if not term_col:
                    for c in df.columns:
                        if "submission" in c.lower():
                            term_col = c
                            break

                if not term_col:
                    print(f"⏩ Skipping sheet {sheet_name} — no term column")
                    continue

                # -----------------------------
                # MRCT Structured Format
                # -----------------------------
                if "Glossary Definition" in df.columns:
                    # All possible MRCT columns
                    mrct_cols = [
                        "Glossary Definition",
                        "Use in Context",
                        "More Info",
                        "Other Info to Think About When Joining a Study",
                        "Related Terms",
                        "Resource URL",
                    ]
                    def_cols = [c for c in mrct_cols if c in df.columns]
                else:
                    # Generic fallback
                    def_cols = [
                        c
                        for c in df.columns
                        if any(
                            k in c.lower()
                            for k in [
                                "definition",
                                "description",
                                "cdisc definition",
                                "context",
                                "info",
                                "related",
                            ]
                        )
                    ]

                # Last resort: assume the second column holds the definition.
                if not def_cols and len(df.columns) > 1:
                    def_cols = [df.columns[1]]

                # -----------------------------
                # Extract rows
                # -----------------------------
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    # Clean + dedupe: `seen` prevents the same text appearing
                    # twice (parsed section + raw column duplicate).
                    def_parts = []
                    seen = set()

                    if "Glossary Definition" in df.columns:
                        raw = str(row.get("Glossary Definition", "")).strip()
                        parsed = parse_mrct_cell(raw)

                        if parsed:
                            # Preferred order
                            for label in SECTION_LABELS:
                                for plabel, ptext in parsed:
                                    if plabel.lower() == label.lower() and ptext.strip():
                                        if ptext not in seen:
                                            def_parts.append(f"<b>{label}:</b> {ptext}")
                                            seen.add(ptext)

                            # Add missing columns (non-duplicates)
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                        else:
                            # Fallback to direct column reading
                            for c in def_cols:
                                val = str(row.get(c, "")).strip()
                                if val and val not in seen:
                                    def_parts.append(f"<b>{c}:</b> {val}")
                                    seen.add(val)
                    else:
                        # Non-MRCT Excel rows
                        for c in def_cols:
                            val = str(row.get(c, "")).strip()
                            if val and val not in seen:
                                def_parts.append(f"<b>{c}:</b> {val}")
                                seen.add(val)

                    if not def_parts:
                        continue

                    # HTML fragments are joined with <br> for UI rendering.
                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [excel],
                        "file": excel,
                        "sheet": sheet_name,
                        "type": "excel",
                    }

                    key = f"{normalize_term(term)}__{excel}"
                    all_defs[key] = entry

            print(f"✅ Processed Excel: {excel}")

        except Exception as e:
            print(f"⚠️ Excel extraction error: {excel}: {e}")

    # ----------------------------------------------------
    # 3️⃣ Web Sources
    # ----------------------------------------------------
    web_defs = []
    for url in WEB_SOURCES:
        items = extract_web_glossary(url)
        for e in items:
            key = f"{normalize_term(e['term'])}__{e['file']}"
            all_defs[key] = e
            web_defs.append(e)

    print(f"🌐 Added {len(web_defs)} web entries.")

    # ----------------------------------------------------
    # 4️⃣ SAVE glossary.json
    # ----------------------------------------------------
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # ----------------------------------------------------
    # 5️⃣ UPLOAD TO HUGGINGFACE
    # ----------------------------------------------------
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

    print(f"✨ Glossary rebuild complete in {time.time() - start:.1f}s")
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Allow running this module directly as a one-shot rebuild script.
    rebuild_and_upload()
|
core/hybrid_retriever.py
ADDED
|
@@ -0,0 +1,925 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hybrid retriever (drop-in replacement)
|
| 3 |
+
-------------------------------------
|
| 4 |
+
- Preserves original function & variable names and signatures.
|
| 5 |
+
- Integrates CDISC Excel runtime loader, Abbreviations.xlsx loader,
|
| 6 |
+
PyMuPDF-based clinical-informatics PDF parser,
|
| 7 |
+
and MRCT duplicate-section dedupe.
|
| 8 |
+
- Injects abbreviation and CDISC entries as separate answers (one per term).
|
| 9 |
+
- Uses FAISS + BM25 retrieval as before.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import re
|
| 14 |
+
import time
|
| 15 |
+
import glob
|
| 16 |
+
from urllib.parse import urlparse
|
| 17 |
+
from difflib import SequenceMatcher
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# optional libs
|
| 24 |
+
try:
|
| 25 |
+
import pandas as pd
|
| 26 |
+
except Exception:
|
| 27 |
+
pd = None
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import fitz # PyMuPDF
|
| 31 |
+
except Exception:
|
| 32 |
+
fitz = None
|
| 33 |
+
|
| 34 |
+
# project imports
|
| 35 |
+
from core.glossary import _normalize_term
|
| 36 |
+
from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
|
| 37 |
+
from core.bm25 import search_bm25
|
| 38 |
+
from utils.nlp_helpers import extract_van_tokens, normalize_query_text
|
| 39 |
+
|
| 40 |
+
# ----------------------------
# CONFIG
# ----------------------------
DENSE_TOP_K = 10        # number of dense (FAISS) candidates retrieved per query
FUZZY_THRESHOLD = 0.15  # minimum SequenceMatcher ratio accepted as a fuzzy match
TOP_RESULTS_LIMIT = 5   # maximum results returned to the caller

GCDMP_FILENAME = "GCDMP_Glossary.pdf"  # exact filename in your HF space/persistent store
|
| 48 |
+
|
| 49 |
+
# ----------------------------
|
| 50 |
+
# UTILITIES (preserve names)
|
| 51 |
+
# ----------------------------
|
| 52 |
+
def fuzzy_ratio(a: str, b: str) -> float:
    """Similarity ratio in [0, 1] between two strings (None treated as "")."""
    left = a or ""
    right = b or ""
    return SequenceMatcher(None, left, right).ratio()
|
| 54 |
+
|
| 55 |
+
def strip_question_phrases(text: str) -> str:
    """Strip leading question/filler words so only the core query term remains.

    Repeatedly removes any leading word from *prefixes* (when followed by
    whitespace or an apostrophe), then trims trailing punctuation and
    collapses internal whitespace.
    """
    text = (text or "").lower().strip()
    # Fix: the original list had two missing commas ("us" "they" and
    # "expand on" "what"), which Python's implicit string concatenation
    # fused into the bogus tokens "usthey" / "expand onwhat" — so "us" and
    # "they" were never actually stripped.
    prefixes = [
        "what", "how", "when", "why", "define", "definition", "meaning", "explain",
        "describe", "expand", "abbreviate", "compare", "identify", "classify",
        "determine", "do", "did", "does", "done", "can", "shall",
        "will", "where", "which", "who", "whose", "have", "might", "could", "would",
        "kindly", "please", "may", "you", "i", "we", "us", "they", "there", "here",
        "what's", "i'll", "where's", "how's", "there's", "who's", "didn't", "doesn't",
        "give", "provide", "mention", "state", "arrange", "asking", "tell", "explain me",
        "can you", "could you", "would you", "please explain", "let me know",
        "say something about", "give details of", "show me", "find", "list", "expand on",
        "is", "was", "were", "are",
    ]
    # NOTE(review): alternation is first-match-wins, so short prefixes shadow
    # longer ones ("what" beats "what's", "expand" beats "expand on") — this
    # matches the original's behavior and is left unchanged here.
    prefix_pattern = r"^(" + "|".join(re.escape(p) for p in prefixes) + r")(\s+|['’])"
    while re.match(prefix_pattern, text):
        text = re.sub(prefix_pattern, "", text).strip()
    text = re.sub(r"[?.!]+$", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()
|
| 76 |
+
|
| 77 |
+
def add_links_to_text(text: str) -> str:
    """Wrap bare http(s) URLs in *text* with safe, new-tab anchor tags."""
    anchor = r'<a href="\1" target="_blank" rel="noopener noreferrer">\1</a>'
    return re.sub(r"(https?://[^\s<]+)", anchor, text)
|
| 79 |
+
|
| 80 |
+
def get_source_rank(src: str, src_type: str) -> int:
    """Priority rank for a source (1 = highest).

    Order: GCDMP glossary PDF, MRCT/Excel, ICH guidance documents,
    other PDFs, web sources, everything else.
    """
    name = (src or "").lower()

    # 1. GCDMP glossary PDF → highest priority
    if GCDMP_FILENAME.lower() in name:
        return 1

    # 2. MRCT Excel or MRCT filename
    if src_type == "excel" or "mrct" in name:
        return 2

    # 3. ICH documents (E6, E3, E2A, E9, E1)
    ich_markers = [
        "ich_e6", "ich-e6", "ich e6", "ich_e3", "ich-e3", "ich e3",
        "ich_e2", "ich-e2", "ich e2", "ich_e9", "ich-e9", "ich e9",
        "ich_e1", "ich-e1", "ich e1",
    ]
    if any(marker in name for marker in ich_markers):
        return 3

    # 4. Other PDFs
    if src_type == "pdf":
        return 4

    # 5. Web sources
    if src_type == "web":
        return 5

    return 6
|
| 98 |
+
|
| 99 |
+
# Patterns to filter junk lines commonly found in PDF extractions
JUNK_PATTERNS = [
    r"^\s*\d+\s*$",  # page-only lines
    r"^\s*Page\s+\d+\s*$",  # "Page N" footers
    r"^\s*Table of Contents.*$",
    r"^\s*Figure\s+\d+.*$",
    r"^\s*Section\s+\d+.*$",
    r".*\.{5,}.*",  # dotted lines
    r"^\s*-{3,}\s*$",  # dash rules
    r"^\s*_+\s*$",  # underscore rules
    r"^\s*required by regulatory authorities.*$",  # recurring boilerplate line
]
# Compiled once at import time; all matching is case-insensitive.
_COMPILED_JUNK = [re.compile(p, flags=re.IGNORECASE) for p in JUNK_PATTERNS]
|
| 112 |
+
|
| 113 |
+
def clean_extracted_text(text: str) -> str:
    """Drop junk lines (page numbers, rules, TOC noise) and tidy whitespace.

    Empty input is returned unchanged; otherwise returns the surviving
    lines joined with newlines, each with runs of dots and spaces collapsed.
    """
    if not text:
        return text

    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Skip any line matching one of the precompiled junk patterns.
        if any(pat.match(stripped) for pat in _COMPILED_JUNK):
            continue
        stripped = re.sub(r'\.{3,}', '.', stripped)
        stripped = re.sub(r'\s{2,}', ' ', stripped)
        kept.append(stripped)
    return "\n".join(kept)
|
| 133 |
+
|
| 134 |
+
def dedupe_section_headers(txt):
    """
    Remove repeated section headers such as 'Use in Context', 'More Info', etc.
    Keeps first occurrence of each heading.
    """
    if not txt:
        return txt

    known_headings = {
        "Glossary Definition", "Use in Context", "More Info",
        "Other Info to Think About When Joining a Study", "Related Terms", "Term URL",
        "Other Resources", "Resource URL"
    }
    emitted = set()
    result = []
    for line in txt.splitlines():
        # A heading is a line consisting solely of "<label>:".
        match = re.match(r"^([A-Za-z0-9 \-]{3,200}):\s*$", line)
        if match:
            heading = match.group(1).strip()
            if heading in known_headings:
                if heading in emitted:
                    # Duplicate of a known heading — drop it.
                    continue
                emitted.add(heading)
        result.append(line)
    return "\n".join(result)
|
| 162 |
+
|
| 163 |
+
# ----------------------------
# Excel / MRCT section parser (robust)
# ----------------------------
# Recognizes label variants and returns ordered sections as list of (label, text)
# Canonical output order for parsed MRCT sections.
_SECTION_LABELS_ORDER = [
    "Glossary Definition",
    "Use in Context",
    "More Info",
    "Other Info to Think About When Joining a Study",
    "Related Terms",
    "Other Resources",
    "Resource URL",
    "Term URL",
    "CDISC/NCI URL"
]

# label alternatives to catch small variations
# Maps canonical (lower-case) label → accepted spelling variants; matching in
# parse_excel_sections is substring-based against the lowered label text.
_LABEL_ALIASES = {
    "glossary definition": ["glossary definition", "definition", "glossarydefinition"],
    "use in context": ["use in context", "use in context:"],
    "more info": ["more info", "more information", "additional info", "additional information"],
    "other info to think about when joining a study": [
        "other info to think about when joining a study",
        "other info to think about when joining the study",
        "other info to think about when joining a study:"
    ],
    "related terms": ["related terms", "related term", "related terms:"],
    "other resources": ["other resources", "other resource"],
    "resource url": ["resource url", "other resources:", "other resources:"],
    "term url": ["term url", "term url:"],
    "cdisc/nci url": ["cdisc/nci url", "cdisc nci url"],
}

# regex to find labels in a single blob of text
# Named group "label" captures the heading; the trailing ':' or '-' plus
# whitespace is consumed so section values start clean.
_LABEL_RE = re.compile(
    r"(?P<label>(Glossary Definition|Definition|Use in Context|More Info|More Information|Other Info to Think About When Joining a Study|Other Info|Related Terms|Related Terms:|Related Term|Other Resources|Resource URL|Term URL|CDISC/NCI URL))\s*[:\-]\s*",
    flags=re.IGNORECASE
)
|
| 201 |
+
|
| 202 |
+
def parse_excel_sections(blob: str):
    """
    Parse a text blob that may contain multiple labeled sections (MRCT style).
    Returns ordered list of (label, text) based on _SECTION_LABELS_ORDER.
    If labels are missing, returns a single ('Glossary Definition', blob).
    """
    if not blob or not isinstance(blob, str):
        return []

    # Normalize common HTML tags if present (some entries might include <br> or <b>)
    b = re.sub(r"<br\s*/?>", "\n", blob, flags=re.IGNORECASE)
    b = re.sub(r"</?[^>]+>", "", b)  # strip tags conservatively

    # Find all label matches and their positions
    matches = list(_LABEL_RE.finditer(b))
    if not matches:
        # no labels found: attempt heuristic splits by known label-like lines
        # split by double newline or "Related Terms:" if present
        if "\n\n" in b:
            parts = [p.strip() for p in b.split("\n\n") if p.strip()]
            # heuristically map first part to glossary definition
            out = [("Glossary Definition", parts[0])]
            # remaining parts appended as "More Info" or "Other Info"
            for i, p in enumerate(parts[1:], start=1):
                label = "More Info" if i == 1 else f"Other Info {i}"
                out.append((label, p))
            return out
        # fallback single block
        return [("Glossary Definition", b.strip())]

    # build spans for label->value: each value runs from the end of its
    # label match to the start of the next label (or end of blob).
    spans = []
    for idx, m in enumerate(matches):
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(b)
        label = m.group("label").strip().rstrip(":").strip()
        val = b[start:end].strip()
        spans.append((label, val))

    # Normalize labels to canonical labels and build ordered dict
    canonical = {}
    for lab, val in spans:
        key = lab.lower().strip().rstrip(":")
        # map through alias set (substring match against the lowered label)
        mapped = None
        for canon, aliases in _LABEL_ALIASES.items():
            for alias in aliases:
                if alias in key:
                    mapped = canon
                    break
            if mapped:
                break
        if not mapped:
            # fallback: title-case the label
            mapped = lab.strip().title()
        # Repeated sections under the same label are concatenated.
        canonical[mapped] = canonical.get(mapped, "") + ("\n\n" + val if canonical.get(mapped) else val)

    # produce ordered list according to preferred order
    # NOTE(review): Title-cased fallback keys can never match the lower-cased
    # order keys here; they only survive via the "if not out" branch below —
    # confirm this is the intended behavior.
    out = []
    for label in _SECTION_LABELS_ORDER:
        key = label.lower()
        if key in canonical:
            out.append((label, canonical[key].strip()))
    # if nothing matched (odd case), return spans as-is (label, val)
    if not out:
        for lab, val in spans:
            out.append((lab.strip(), val.strip()))
    return out
|
| 270 |
+
|
| 271 |
+
# ----------------------------
|
| 272 |
+
# Abbreviations.xlsx loader (Priority B: treated as excel)
|
| 273 |
+
# ----------------------------
|
| 274 |
+
def load_abbreviations_entries(search_paths=None):
    """
    Discover Abbreviations.xlsx and return list of candidate dicts:
    [{'definition','text','file','type','term','sources'}...]
    Auto-detects header names: term/acronym and definition/long name/description.
    Falls back to positional columns (A=term, B=definition).
    """
    if pd is None:
        logger.warning("pandas not installed — skipping Abbreviations.xlsx load.")
        return []

    # common HF cache & persistent paths
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        # Caller-supplied paths are searched before the defaults.
        default_paths = list(search_paths) + default_paths

    # Glob several filename variants under every search root; best-effort,
    # any path error just moves on to the next root.
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*Abbreviations*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/Abbreviations.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbrev*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*abbreviations*.xls*"), recursive=True))
        except Exception:
            continue
    # De-duplicate while preserving discovery order.
    files = list(dict.fromkeys(files))

    entries = []
    for fx in files:
        try:
            # ".xlsx" (ends with "x") → prefer the openpyxl engine, falling
            # back to pandas' default engine on failure.
            if fx.lower().endswith("x") and pd is not None:
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read Abbreviations file %s: %s", fx, e)
            continue

        cols = {c.strip().lower(): c for c in df.columns}
        # common possible headers for term
        term_candidates = ["abbreviation", "acronym", "term", "short form", "initial", "abbrev", "abbrev."]
        def_candidates = ["definition", "description", "long name", "meaning", "full form", "explanation"]

        # First matching header (substring match) wins for each role.
        term_col = None
        def_col = None
        for k, v in cols.items():
            if any(tc in k for tc in term_candidates) and term_col is None:
                term_col = v
            if any(dc in k for dc in def_candidates) and def_col is None:
                def_col = v

        # fallback to positional columns A/B (0/1)
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or (df.columns[1] if len(df.columns) > 1 else df.columns[0])
            except Exception:
                logger.warning("Abbreviations file %s missing expected term/definition columns. Skipping.", fx)
                continue

        count = 0
        for _, row in df.iterrows():
            term = str(row.get(term_col) or "").strip()
            definition = str(row.get(def_col) or "").strip()
            if not term or not definition:
                continue
            # combine into text for retrieval
            combined = definition
            entries.append({
                "definition": definition,
                "text": combined,
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
            count += 1
        logger.info("Loaded %d abbreviations from %s", count, fx)
    logger.info("Total loaded abbreviations: %d", len(entries))
    return entries
|
| 367 |
+
|
| 368 |
+
# ----------------------------
|
| 369 |
+
# CDISC Excel loader (improved HF cache discovery)
|
| 370 |
+
# ----------------------------
|
| 371 |
+
def load_cdisc_entries(search_paths=None):
    """
    Discover CDISC glossary Excel files and return normalized candidate dicts.

    Each returned entry has the shape:
        {'definition', 'text', 'file', 'type', 'term', 'sources'}

    Search order: any caller-supplied ``search_paths`` first, then a fixed set
    of default locations (cwd, data dirs, the Hugging Face dataset cache).
    Returns an empty list when pandas is unavailable or no files are found.
    """
    if pd is None:
        logger.warning("pandas not installed — skipping CDISC Excel load.")
        return []

    # HF cache path where datasets are downloaded during rebuild
    HF_CACHE_ROOT = "/root/.cache/huggingface/hub"
    HF_DATASET_PREFIX = os.path.join(HF_CACHE_ROOT, "datasets--essprasad--CT-Chat-Docs")

    default_paths = [
        ".", "/workspace/data", "/mnt/data", os.getcwd(),
        HF_CACHE_ROOT,
        HF_DATASET_PREFIX,
        os.path.join(HF_DATASET_PREFIX, "snapshots"),
        "/home/user/app/persistent", "/home/user/app/persistent/glossary",
        "/app/persistent", "/persistent", "/root/.cache"
    ]
    if search_paths:
        default_paths = list(search_paths) + default_paths

    # find files (recursive search)
    files = []
    for base in default_paths:
        try:
            files.extend(glob.glob(os.path.join(base, "*[Cc][Dd][Ii][Ss][Cc]*.xls*"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/CDISC Glossary.xlsx"), recursive=True))
            files.extend(glob.glob(os.path.join(base, "**/*CDISC*.xls*"), recursive=True))
        except Exception:
            continue
    files = list(dict.fromkeys(files))  # unique, order-preserving

    entries = []
    for fx in files:
        try:
            # Prefer the openpyxl engine for .xlsx files; fall back to pandas'
            # default engine if that fails.
            # FIX: was `fx.lower().endswith("x") and pd is not None` — the "x"
            # suffix test matched any path merely ending in "x", and the pd
            # check is redundant after the early return above.
            if fx.lower().endswith(".xlsx"):
                try:
                    df = pd.read_excel(fx, engine="openpyxl")
                except Exception:
                    df = pd.read_excel(fx)
            else:
                df = pd.read_excel(fx)
        except Exception as e:
            logger.exception("Failed to read CDISC file %s: %s", fx, e)
            continue

        # Map lowercase header -> original column name for tolerant lookup.
        cols = {c.strip().lower(): c for c in df.columns}
        term_col = cols.get("cdisc submission value") or cols.get("term") or cols.get("submission value")
        syn_col = cols.get("cdisc synonym(s)") or cols.get("cdisc synonym") or cols.get("synonym(s)") or cols.get("synonyms")
        def_col = cols.get("cdisc definition") or cols.get("definition") or cols.get("cdisc definition(s)")

        # fallback to positional columns A/B/C if headers differ
        if not term_col or not def_col:
            try:
                term_col = term_col or df.columns[0]
                def_col = def_col or df.columns[2]
                syn_col = syn_col or (df.columns[1] if len(df.columns) > 1 else None)
            except Exception:
                # Fewer than 3 columns — cannot locate a definition column.
                logger.warning("CDISC file %s missing expected columns (A or C). Skipping.", fx)
                continue

        for _, row in df.iterrows():
            term = str(row.get(term_col) or "").strip()
            synonyms = str(row.get(syn_col) or "").strip() if syn_col else ""
            definition = str(row.get(def_col) or "").strip()
            if not term or not definition:
                continue
            # Build combined text including synonyms for better retrieval
            text_parts = [definition]
            if synonyms:
                text_parts.append("Synonyms: " + synonyms)
            combined = "\n\n".join([p for p in text_parts if p])
            entries.append({
                "definition": definition,
                "text": combined,
                "file": os.path.basename(fx),
                "type": "excel",
                "term": term,
                "sources": [os.path.basename(fx)]
            })
    logger.info("Loaded %d CDISC entries from %d files", len(entries), len(files))
    return entries
|
| 457 |
+
|
| 458 |
+
# ----------------------------
|
| 459 |
+
# Clinical-informatics PDF parser using PyMuPDF (fitz)
|
| 460 |
+
# ----------------------------
|
| 461 |
+
def parse_clinical_informatics_pdf(path):
    """
    Parse clinical-informatics-acronym-glossary.pdf with PyMuPDF (fitz).

    Scans blank-line-separated text blocks; a short all-caps/digit/dash
    heading is treated as an acronym and all following blocks up to the next
    acronym heading become its definition. Returns candidate dicts shaped
    like the Excel loaders: {'definition','text','file','type','term','sources'}.
    """
    if fitz is None:
        logger.warning("PyMuPDF (fitz) not installed — skipping clinical-informatics PDF parsing.")
        return []

    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.exception("Failed to open PDF %s: %s", path, e)
        return []

    # Pull plain text from every readable page; skip pages that fail.
    page_texts = []
    for page in doc:
        try:
            extracted = page.get_text("text")
        except Exception:
            continue
        if extracted:
            page_texts.append(extracted)
    doc.close()

    body = "\n".join(page_texts).replace("\r", "")
    # split into blocks by blank lines
    blocks = [b.strip() for b in re.split(r"\n\s*\n", body) if b.strip()]

    # acronym-like heading: all-caps letters, digits, dashes/slashes, short
    acronym_re = re.compile(r"^[A-Z0-9\-/]{1,12}$")

    entries = []
    idx = 0
    total = len(blocks)
    while idx < total:
        heading = blocks[idx].splitlines()[0].strip()
        if not acronym_re.match(heading):
            idx += 1
            continue

        # Collect description blocks until the next acronym-style heading.
        nxt = idx + 1
        desc_blocks = []
        while nxt < total and not acronym_re.match(blocks[nxt].splitlines()[0].strip()):
            desc_blocks.append(blocks[nxt])
            nxt += 1

        if desc_blocks:
            definition = "\n\n".join(desc_blocks).strip()
            entries.append({
                "definition": definition,
                "text": definition,
                "file": os.path.basename(path),
                "type": "pdf",
                "term": heading,
                "sources": [os.path.basename(path)]
            })
        idx = nxt

    logger.info("Parsed %d entries from clinical-informatics PDF (PyMuPDF).", len(entries))
    return entries
|
| 523 |
+
|
| 524 |
+
# ----------------------------
|
| 525 |
+
# MAIN RETRIEVER (preserve name)
|
| 526 |
+
# ----------------------------
|
| 527 |
+
def summarize_combined(query: str, mode: str = "short") -> str:
    """
    Answer a user query by combining FAISS dense retrieval, BM25 keyword
    retrieval, and runtime-loaded glossary sources (Abbreviations/CDISC Excel
    files plus the clinical-informatics PDF), then render one HTML answer
    block per original source.

    Parameters
    ----------
    query : str
        Raw user question.
    mode : str
        Accepted for interface compatibility; not used in this body.
        # NOTE(review): confirm whether any caller still passes mode.

    Returns
    -------
    str
        An HTML fragment with one section per source, or an italicized
        message when no query / no results.
    """
    start = time.time()
    if not query or not query.strip():
        return "<i>No query provided.</i>"

    # Normalize user query
    cleaned = strip_question_phrases(query)
    expanded = normalize_query_text(cleaned)
    van_tokens = extract_van_tokens(expanded)
    normalized = " ".join(van_tokens).strip() or cleaned
    nq = normalized.lower().strip()
    print(f"🔍 summarize_combined() | cleaned='{cleaned}' normalized='{nq}'")

    # Acronym expansion map (preserve/extend)
    acronym_map = {
        "ae": "adverse event", "adr": "adverse drug reaction",
        "crf": "case report form", "ecrf": "electronic case report form",
        "cro": "contract research organization", "csr": "clinical study report",
        "ctms": "clinical trial management system", "edc": "electronic data capture",
        "ehr": "electronic health record", "emr": "electronic medical record",
        "gcp": "good clinical practice", "irb": "institutional review board",
        "iec": "independent ethics committee", "ind": "investigational new drug application",
        "mrct": "multi-regional clinical trials", "qa": "quality assurance",
        "qc": "quality control", "sae": "serious adverse event", "sap": "statistical analysis plan",
        "siv": "site initiation visit", "sop": "standard operating procedure",
        "ssu": "study start-up", "uat": "user acceptance testing",
        "whodrug": "world health organization drug dictionary",
    }

    # If the whole query is a known acronym, search on its expansion instead.
    glossary_key = _normalize_term(nq)
    if glossary_key in acronym_map:
        expanded_term = acronym_map[glossary_key]
        nq = _normalize_term(expanded_term)
        print(f"🔁 Acronym expanded → {expanded_term}")

    # ----------------------------
    # FAISS + BM25 retrieval
    # ----------------------------
    # Both retrieval paths are best-effort: failures are logged and we
    # continue with whatever hits we have.
    dense_hits, bm25_hits = [], []
    try:
        if _ensure_faiss_index():
            dense_hits = search_index(normalized, top_k=DENSE_TOP_K) or []
            print(f"✅ FAISS hits: {len(dense_hits)}")
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")

    try:
        docs = load_all_text_chunks()
        if docs:
            bm25_hits = search_bm25(normalized, docs, top_n=8) or []
            print(f"✅ BM25 hits: {len(bm25_hits)}")
    except Exception as e:
        print(f"⚠️ BM25 fallback failed: {e}")

    # ----------------------------
    # Inject Abbreviations + CDISC + clinical-informatics PDF parsed entries (runtime)
    # ----------------------------
    extra_hits = []
    try:
        abbrev_entries = load_abbreviations_entries()
        for e in abbrev_entries:
            extra_hits.append({
                "definition": e["definition"],
                "text": e["text"],
                "file": e["file"],
                "type": e["type"],
                "term": e["term"],
                "sources": e.get("sources", [])
            })
    except Exception as e:
        logger.exception("Abbreviations load failed: %s", e)

    try:
        cdisc_entries = load_cdisc_entries()
        for e in cdisc_entries:
            extra_hits.append({
                "definition": e["definition"],
                "text": e["text"],
                "file": e["file"],
                "type": e["type"],
                "term": e["term"],
                "sources": e.get("sources", [])
            })
    except Exception as e:
        logger.exception("CDISC load failed: %s", e)

    try:
        pdf_paths = glob.glob("./*clinical*informatics*.pdf") + glob.glob("/mnt/data/*clinical*informatics*.pdf") + glob.glob("/workspace/data/*clinical*informatics*.pdf")
        pdf_paths = list(dict.fromkeys(pdf_paths))
        for p in pdf_paths:
            parsed = parse_clinical_informatics_pdf(p)
            for e in parsed:
                extra_hits.append({
                    "definition": e["definition"],
                    "text": e["text"],
                    "file": e["file"],
                    "type": e["type"],
                    "term": e["term"],
                    "sources": e.get("sources", [])
                })
    except Exception as e:
        logger.exception("clinical-informatics parse failed: %s", e)

    hits = (dense_hits or []) + (bm25_hits or []) + extra_hits
    if not hits:
        return "<i>No relevant information found.</i>"

    # ----------------------------
    # Group by original resolved source (prefer real source over glossary.json)
    # ----------------------------
    grouped = {}
    glossary_fallbacks = []

    for h in hits:
        raw_src = h.get("file") or h.get("source") or h.get("source_file") or "unknown"
        meta_sources = h.get("sources") or h.get("source_list") or []

        # prefer a non-glossary meta source if available
        src = raw_src
        if isinstance(meta_sources, (list, tuple)) and meta_sources:
            chosen = None
            for s in meta_sources:
                if isinstance(s, str) and not s.lower().endswith("glossary.json"):
                    chosen = s
                    break
            if chosen:
                src = chosen
            else:
                src = meta_sources[0]

        src_type = (h.get("type") or "").lower()
        term = (h.get("term") or "").strip()
        term_lower = term.lower()

        txt = (h.get("definition") or h.get("text") or h.get("content") or h.get("full_text") or "").strip()
        if not txt:
            continue

        txt = clean_extracted_text(txt)
        # If original stored file was glossary.json, keep as fallback only
        if str(raw_src).lower().endswith("glossary.json"):
            glossary_fallbacks.append({"hit": h, "text": txt, "src": src})

        # Save resolved sources for provenance. Ensure URL-like sources are preserved.
        resolved_sources = meta_sources if meta_sources else []
        # If resolved_sources empty, try to collect URL-like values from hit fields or src
        if not resolved_sources:
            possible = []
            for key in ("url", "source", "link", "file"):
                v = h.get(key)
                if isinstance(v, str) and v.startswith("http"):
                    possible.append(v)
            if isinstance(src, str) and src.startswith("http"):
                possible.append(src)
            # fallback to raw_src if nothing else
            resolved_sources = possible or [raw_src]
        # normalize to list
        if isinstance(resolved_sources, (list, tuple)):
            resolved_sources = list(resolved_sources)
        else:
            resolved_sources = [resolved_sources]
        h["_resolved_sources"] = resolved_sources

        # For MRCT-like text (detected by filename), dedupe repeated sections first
        if "mrct" in str(src).lower() or "mrct" in str(raw_src).lower():
            txt = dedupe_section_headers(txt)

        # Group key based on resolved original source + type + term
        # Special-case Abbreviations so each term is unique (Priority B behavior)
        src_l = str(src).lower()
        raw_src_l = str(raw_src).lower()
        if "abbreviations.xlsx" in src_l or "abbreviations.xlsx" in raw_src_l or ("abbreviations" in src_l and src_type == "excel"):
            key = f"abbrev__excel__{term_lower}"
        # Special-case CDISC so each term is unique (Option A)
        elif "cdisc glossary.xlsx" in src_l or "cdisc glossary.xlsx" in raw_src_l or ("cdisc" in src_l and src_type == "excel"):
            key = f"cdisc__excel__{term_lower}"
        else:
            key = f"{os.path.basename(src).lower()}__{src_type}__{term_lower[:200]}"

        # Prefer glossary PDF entries (GCDMP/ 'glossary' in filename) when colliding with long chunks
        prefer_glossary = (GCDMP_FILENAME.lower() in str(src).lower()) or ("glossary" in str(src).lower())

        if key not in grouped:
            grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
        else:
            existing_src = grouped[key]["src"]
            existing_is_glossary = (GCDMP_FILENAME.lower() in str(existing_src).lower()) or ("glossary" in str(existing_src).lower())
            if prefer_glossary and not existing_is_glossary:
                grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}
            else:
                # otherwise prefer longer chunk unless this new is a glossary and existing is not
                if not prefer_glossary and len(txt) > len(grouped[key]["text"]):
                    grouped[key] = {"hit": h, "text": txt, "src": src, "src_type": src_type, "term": term}

    # ----------------------------
    # Format answers: one per original source
    # ----------------------------
    answers = []
    src_counts = {"excel": 0, "pdf": 0, "web": 0, "other": 0}

    # ensure only one combined excel answer per term to prevent duplicated sections
    seen_excel_terms = set()

    for entry in grouped.values():
        h = entry["hit"]
        txt = entry["text"]
        src = entry["src"]
        src_type = entry.get("src_type") or (h.get("type") or "").lower()
        term = entry.get("term") or (h.get("term") or "").strip()
        term_lower = (term or "").lower()

        # Skip entries resolved to glossary.json here (we'll use them only as fallback)
        if str(src).lower().endswith("glossary.json"):
            continue

        # If this is an excel entry for MRCT/CDISC/Abbrev, ensure only first combined answer per term
        is_excel = (src_type == "excel") or str(src).lower().endswith((".xls", ".xlsx"))
        if is_excel:
            if term_lower in seen_excel_terms:
                # skip duplicate excel results for same term (they will be combined in the first occurrence)
                continue
            # mark as seen (so subsequent excel chunks won't produce duplicates)
            seen_excel_terms.add(term_lower)

        # Skip noisy PDF sections unless they look like short glossary entries
        txt_lower = txt.lower()
        if src_type == "pdf" and any(k in txt_lower[:300] for k in ["table of contents", "appendix", "index", "section"]):
            if not (len(txt.split()) < 80 and term_lower and term_lower in txt_lower[:120]):
                # treat as noise
                continue

        # Determine icon and counts
        if src_type == "excel":
            icon, cat = "📘", "excel"
        elif src_type == "pdf":
            icon, cat = "📄", "pdf"
        elif src_type == "web":
            icon, cat = "🌐", "web"
        else:
            icon, cat = "📁", "other"
        src_counts[cat] += 1

        # Extract excerpt (PDF / web special handling for glossary-style)
        excerpt = ""
        if src_type in ("pdf", "web"):
            paragraphs = re.split(r"\n\s*\n", txt)
            paragraphs = [p.strip() for p in paragraphs if p.strip()]

            # 1) If full term matches heading (e.g., "electronic health record")
            if paragraphs and term_lower:
                heading = paragraphs[0].strip().lower()
                if heading == term_lower or (term_lower in heading):
                    excerpt = paragraphs[1].strip() if len(paragraphs) > 1 else paragraphs[0].strip()

            # 2) If not yet found, try exact normalized query inside paragraphs
            if not excerpt:
                found = None
                for p in paragraphs:
                    if nq and nq in p.lower():
                        found = p.strip()
                        break

                # 3) Fuzzy match with paragraph starts
                if not found and term_lower:
                    for p in paragraphs:
                        if fuzzy_ratio(term_lower, p.lower()[:100]) > 0.75:
                            found = p.strip()
                            break

                # 4) Paragraph following a heading that contains the term
                if not found and term_lower:
                    for i, p in enumerate(paragraphs[:-1]):
                        if term_lower in p.lower():
                            found = paragraphs[i + 1].strip()
                            break

                excerpt = (found or (paragraphs[0] if paragraphs else txt)).strip()

            excerpt = excerpt[:2000] + ("..." if len(excerpt) > 2000 else "")
            excerpt = add_links_to_text(excerpt)

        elif src_type == "excel":
            # Special-case: Abbreviations -> always show full clean definition (single block)
            if "abbreviations.xlsx" in str(src).lower() or ("abbreviations" in str(src).lower() and src_type=="excel"):
                excerpt = add_links_to_text(txt)
            # Special-case: CDISC -> always show full clean definition (single block)
            elif "cdisc glossary.xlsx" in str(src).lower() or ("cdisc" in str(src).lower() and src_type=="excel"):
                excerpt = add_links_to_text(txt)
            else:
                # General Excel/MRCT parsing: parse labeled sections and build one combined excerpt
                try:
                    sections = parse_excel_sections(txt)
                except Exception:
                    sections = [("Glossary Definition", txt)]

                lines = []
                seen_vals = set()
                for label, val in sections:
                    if not val or not str(val).strip():
                        continue
                    v = str(val).strip()
                    # Clickify URLs if the section is a single URL
                    if re.match(r"^https?://\S+$", v):
                        v_html = f'<a href="{v}" target="_blank" rel="noopener noreferrer">{v}</a>'
                    else:
                        v_html = add_links_to_text(v)
                    # Avoid duplicate repeated text segments
                    if v_html in seen_vals:
                        continue
                    seen_vals.add(v_html)
                    lines.append(f"<b>{label}:</b> {v_html}")

                excerpt = "<br>".join(lines) if lines else add_links_to_text(txt)

        else:
            excerpt = add_links_to_text(txt)

        # Prepare heading and display sources (exclude internal glossary.json from display)
        heading_term = term.strip() or os.path.splitext(os.path.basename(src))[0]
        heading_html = f"<h4>{icon} {heading_term}</h4>"

        # Use _resolved_sources (preserved earlier) and ensure web URLs are shown directly
        # Build clickable sources
        resolved_sources = h.get("_resolved_sources") or []
        display_sources = []

        for s in resolved_sources:
            if not isinstance(s, str):
                continue
            if s.lower().endswith("glossary.json"):
                continue

            if s.startswith("http"):
                display_sources.append(
                    f'<a href="{s}" target="_blank" rel="noopener noreferrer">{s}</a>'
                )
            else:
                display_sources.append(os.path.basename(s))

        # Fallback if empty
        if not display_sources:
            if isinstance(src, str) and src.startswith("http"):
                display_sources = [
                    f'<a href="{src}" target="_blank" rel="noopener noreferrer">{src}</a>'
                ]
            else:
                display_sources = [os.path.basename(str(src))]

        # ALWAYS create sources_line safely
        sources_line = (
            "<p>🔗 <i>Source:</i> "
            + " · ".join(dict.fromkeys(display_sources))
            + "</p>"
        )

        answers.append({
            "rank": get_source_rank(src, src_type),
            "type": cat,
            "term": term,
            "html": f"{heading_html}{sources_line}<blockquote>{excerpt}</blockquote>"
        })

    # ----------------------------
    # Fallback: only use glossary.json definitions if no other original sources matched
    # ----------------------------
    if not answers and glossary_fallbacks:
        for item in glossary_fallbacks:
            h = item["hit"]
            txt = item["text"]
            src = item.get("src") or (h.get("file") or h.get("source") or "glossary.json")
            term = (h.get("term") or "").strip() or "Definition"
            heading_html = f"<h4>📄 {term}</h4>"
            excerpt = txt.strip()
            answers.append({
                "rank": 10,
                "type": "pdf",
                "term": term,
                "html": f"{heading_html}<p>🔗 <i>Source:</i> {os.path.basename(src)}</p><blockquote>{excerpt}</blockquote>"
            })

    # ----------------------------
    # Final sort & output
    # ----------------------------
    if not answers:
        return "<i>No relevant results found.</i>"

    answers = sorted(answers, key=lambda a: a["rank"])
    final_html_parts = [a["html"] for a in answers[:TOP_RESULTS_LIMIT]]

    summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)

    elapsed = time.time() - start
    print(f"✅ Answers from {len(answers)} sources in {elapsed:.2f}s")

    return (
        f"<h3>🧠 Answers (one per source):</h3>"
        f"<p><i>Sources → {summary_counts}</i></p>"
        + "<br>".join(final_html_parts)
    )
|
core/retrieval.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
from whoosh.index import open_dir
|
| 5 |
+
from whoosh.qparser import MultifieldParser
|
| 6 |
+
|
| 7 |
+
WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"
|
| 8 |
+
|
| 9 |
+
_ix = None
|
| 10 |
+
|
| 11 |
+
def _load_whoosh():
    """Open the Whoosh index once and memoize it; returns the cached index
    (or None when the index directory does not exist)."""
    global _ix
    if _ix is not None:
        return _ix
    if os.path.exists(WHOOSH_INDEX_PATH):
        _ix = open_dir(WHOOSH_INDEX_PATH)
    return _ix
|
| 16 |
+
|
| 17 |
+
def _bm25_search(query, top_n=10):
    """BM25 keyword search over the Whoosh index.

    Returns up to ``top_n`` dicts of {'text', 'file'}; an empty list when
    the index is unavailable.
    """
    ix = _load_whoosh()
    if not ix:
        return []
    parsed = MultifieldParser(["text", "title"], schema=ix.schema).parse(query)
    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=top_n)
        # Materialize inside the `with` — Hit objects are tied to the searcher.
        return [{"text": hit["text"], "file": hit.get("file", "")} for hit in hits]
|
core/van_normalizer.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# core/van_normalizer.py
|
| 2 |
+
import re
|
| 3 |
+
import nltk
|
| 4 |
+
from nltk import pos_tag, word_tokenize
|
| 5 |
+
from nltk.stem import WordNetLemmatizer
|
| 6 |
+
|
| 7 |
+
# make sure you have these (run once if missing):
|
| 8 |
+
# python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4
|
| 9 |
+
|
| 10 |
+
lemmatizer = WordNetLemmatizer()
|
| 11 |
+
|
| 12 |
+
def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for the clinical-trial domain).

    Lowercases and strips punctuation, tokenizes and POS-tags, keeps only
    verbs/adjectives/nouns while dropping articles and auxiliary "be" forms,
    lemmatizes each surviving token, and returns a space-joined string
    suitable for FAISS embedding.
    """
    if not text:
        return ""

    # Basic cleanup: lowercase, keep only alphanumerics/whitespace/hyphens.
    cleaned = re.sub(r"[^a-z0-9\s-]", " ", text.lower().strip())
    tagged = pos_tag(word_tokenize(cleaned))

    # Determiners/articles and auxiliary "be" verbs carry no retrieval signal.
    skip_words = {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}

    lemmas = []
    for token, tag in tagged:
        if token in skip_words:
            continue
        # Keep only Verb (V*), Adjective (J*), and Noun (N*) tags.
        if not (tag.startswith("V") or tag.startswith("J") or tag.startswith("N")):
            continue
        # Map the Penn Treebank tag to the WordNet POS for lemmatization.
        wn_pos = "v" if tag.startswith("V") else ("a" if tag.startswith("J") else "n")
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))

    # Join and collapse any residual whitespace runs.
    return re.sub(r"\s+", " ", " ".join(lemmas).strip())
|
| 57 |
+
|
core/vector_search.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_search.py
|
| 3 |
+
|
| 4 |
+
Thin wrapper helpers used to orchestrate searches and resets from the app or admin UI.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
from core import vector_store, vector_sync
|
| 9 |
+
|
| 10 |
+
def semantic_search(query: str, top_k: int = 6) -> List[Dict[str, Any]]:
    """
    Run a FAISS semantic search through ``vector_store``.

    Any failure (missing index, model errors, etc.) is logged to stdout and
    an empty list is returned instead of raising.
    """
    try:
        hits = vector_store.search_index(query, top_k=top_k)
    except Exception as e:
        print(f"⚠️ semantic_search error: {e}")
        return []
    return hits
|
| 19 |
+
|
| 20 |
+
def reset_faiss_and_rebuild(glossary_builder_fn=None, rebuild_index_fn=None) -> str:
    """
    Clear local FAISS caches, then optionally rebuild glossary and index.

    Parameters
    ----------
    glossary_builder_fn : callable, optional
        Zero-arg function that rebuilds the glossary; its string result is
        appended to the report.
    rebuild_index_fn : callable, optional
        Zero-arg function that triggers a full index rebuild.

    Returns
    -------
    str
        Human-readable status report; failures are embedded in the report
        rather than raised.
    """
    try:
        vector_store.clear_local_faiss()
    except Exception as e:
        print(f"⚠️ clear_local_faiss failed: {e}")

    parts = ["🧹 Cleared local FAISS files.\n"]

    # Glossary rebuild is best-effort.
    if glossary_builder_fn:
        try:
            parts.append(glossary_builder_fn() + "\n")
        except Exception as e:
            parts.append(f"⚠️ Glossary builder failed: {e}\n")

    # Full index rebuild is best-effort as well.
    if rebuild_index_fn:
        try:
            parts.append(rebuild_index_fn())
        except Exception as e:
            parts.append(f"⚠️ Rebuild index failed: {e}\n")

    return "".join(parts)
|
| 47 |
+
|
core/vector_store.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_store.py
|
| 3 |
+
-----------------------------------------------------
|
| 4 |
+
Maintains FAISS runtime index + metadata cache.
|
| 5 |
+
|
| 6 |
+
Features
|
| 7 |
+
--------
|
| 8 |
+
- Ensure local FAISS runtime index exists (download from HF if missing)
|
| 9 |
+
- FAISS semantic search and BM25 text access
|
| 10 |
+
- Automatic TTL reload
|
| 11 |
+
- Full cache clearing for Hugging Face Space
|
| 12 |
+
- Explicit "♻️ FAISS memory cache reset" logging on rebuild
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import json
|
| 17 |
+
import time
|
| 18 |
+
import shutil
|
| 19 |
+
from typing import List, Dict, Any, Optional
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import faiss
|
| 23 |
+
from sentence_transformers import SentenceTransformer
|
| 24 |
+
from huggingface_hub import hf_hub_download
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ------------------------------------------------------------------
# 🔧 Paths & constants
# ------------------------------------------------------------------
PERSISTENT_DIR = "/home/user/app/persistent"   # long-lived copy of the index artifacts
RUNTIME_DIR = "/home/user/app/runtime_faiss"   # working copy actually loaded for search
INDEX_NAME = "faiss.index"                     # FAISS index file name
META_NAME = "faiss.index.meta.json"            # JSON list of per-vector metadata
GLOSSARY_META = "glossary.json"                # glossary cache, cleared alongside the index
HF_INDEX_REPO = "essprasad/CT-Chat-Index"      # HF dataset repo the artifacts are synced with

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBED_MODEL = None  # lazy loaded

# in-memory cache (module-level singletons, refreshed when the TTL lapses)
_runtime_index: Optional[faiss.Index] = None          # loaded FAISS index, or None
_runtime_meta: Optional[List[Dict[str, Any]]] = None  # metadata aligned with the vectors
_meta_loaded_time = 0.0        # time.time() of the last successful load
_META_TTL_SECONDS = 300.0      # reload metadata after 5 minutes
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ------------------------------------------------------------------
|
| 48 |
+
# 🔹 Helpers
|
| 49 |
+
# ------------------------------------------------------------------
|
| 50 |
+
def _ensure_dirs():
    """Create the persistent and runtime directories if they are missing."""
    for directory in (PERSISTENT_DIR, RUNTIME_DIR):
        os.makedirs(directory, exist_ok=True)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _ensure_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The model is cached in the module-level ``EMBED_MODEL`` so the
    expensive load happens at most once per process.
    """
    global EMBED_MODEL
    if EMBED_MODEL is not None:
        return EMBED_MODEL
    print("📥 Loading embedding model for FAISS retrieval…")
    EMBED_MODEL = SentenceTransformer(EMBED_MODEL_NAME)
    print("✅ Embedding model loaded.")
    return EMBED_MODEL
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ------------------------------------------------------------------
|
| 65 |
+
# 🔹 Cache control
|
| 66 |
+
# ------------------------------------------------------------------
|
| 67 |
+
def clear_local_faiss():
    """Delete all local FAISS + glossary caches (safe in HF Space).

    Removes the persistent index/meta/glossary files and the whole
    runtime directory; failures on individual paths are logged but do
    not stop the remaining deletions.
    """
    targets = [
        os.path.join(PERSISTENT_DIR, INDEX_NAME),
        os.path.join(PERSISTENT_DIR, META_NAME),
        os.path.join(PERSISTENT_DIR, GLOSSARY_META),
        RUNTIME_DIR,
    ]
    for path in targets:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            elif os.path.exists(path):
                os.remove(path)
            # printed even when the path was already absent (best-effort log)
            print(f"🗑️ Cleared: {path}")
        except Exception as err:
            print(f"⚠️ Failed to clear {path}: {err}")
    print("♻️ FAISS memory cache reset (runtime + persistent cleared)")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ------------------------------------------------------------------
|
| 87 |
+
# 🔹 Loaders
|
| 88 |
+
# ------------------------------------------------------------------
|
| 89 |
+
def _load_local_index() -> bool:
    """Copy persistent FAISS artifacts into runtime and load them into memory.

    Returns True when both the index and metadata were loaded; on any
    failure the in-memory cache is reset to None and False is returned.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()
    src_index = os.path.join(PERSISTENT_DIR, INDEX_NAME)
    src_meta = os.path.join(PERSISTENT_DIR, META_NAME)
    try:
        if not (os.path.exists(src_index) and os.path.exists(src_meta)):
            return False
        os.makedirs(RUNTIME_DIR, exist_ok=True)
        dst_index = os.path.join(RUNTIME_DIR, INDEX_NAME)
        dst_meta = os.path.join(RUNTIME_DIR, META_NAME)
        shutil.copy2(src_index, dst_index)
        shutil.copy2(src_meta, dst_meta)
        _runtime_index = faiss.read_index(dst_index)
        with open(dst_meta, "r", encoding="utf-8") as fh:
            _runtime_meta = json.load(fh)
        _meta_loaded_time = time.time()
        print(f"✅ Loaded FAISS index ({len(_runtime_meta)} vectors).")
        return True
    except Exception as e:
        print(f"⚠️ Could not load local FAISS index: {e}")
        _runtime_index = None
        _runtime_meta = None
        return False
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _download_index_from_hub() -> bool:
    """Download FAISS artifacts from the Hugging Face dataset repo.

    Fetches ``persistent/faiss.index`` and its metadata file, stores
    both under PERSISTENT_DIR, then loads them. Returns True only when
    the download AND the subsequent local load both succeed.
    """
    _ensure_dirs()
    try:
        print("☁️ Downloading FAISS artifacts from HF dataset…")
        downloaded = {}
        for filename in (INDEX_NAME, META_NAME):
            downloaded[filename] = hf_hub_download(
                repo_id=HF_INDEX_REPO,
                filename=f"persistent/{filename}",
                repo_type="dataset",
            )
        for filename, local_path in downloaded.items():
            shutil.copy2(local_path, os.path.join(PERSISTENT_DIR, filename))
        print("✅ FAISS artifacts downloaded and stored persistently.")
        return _load_local_index()
    except Exception as e:
        print(f"⚠️ HF download failed: {e}")
        return False
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _ensure_faiss_index(force_refresh: bool = False) -> bool:
    """Make sure a runtime FAISS index is available in memory.

    Resolution order: in-memory cache (within TTL) → persistent local
    copy → download from the HF dataset. With ``force_refresh=True``
    the runtime directory and in-memory cache are dropped first, so a
    fresh copy is always loaded.
    """
    global _runtime_index, _runtime_meta, _meta_loaded_time
    _ensure_dirs()

    if force_refresh:
        try:
            shutil.rmtree(RUNTIME_DIR, ignore_errors=True)
            _runtime_index = None
            _runtime_meta = None
            print("♻️ Forced FAISS runtime reload requested.")
        except Exception as e:
            print(f"⚠️ Force refresh failed: {e}")

    cache_is_fresh = (
        _runtime_index is not None
        and (time.time() - _meta_loaded_time) < _META_TTL_SECONDS
    )
    if cache_is_fresh:
        return True

    # short-circuit: only hit the Hub when the local load fails
    if _load_local_index() or _download_index_from_hub():
        return True

    print("⚠️ No FAISS index found locally or remotely.")
    return False
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ------------------------------------------------------------------
|
| 164 |
+
# 🔹 Accessors
|
| 165 |
+
# ------------------------------------------------------------------
|
| 166 |
+
def load_all_text_chunks() -> List[Dict[str, Any]]:
    """Return the metadata list for BM25 fallback or analysis.

    Loads the index on first use; after the TTL lapses the metadata
    file is re-read from the runtime directory (best effort — on
    failure the previously cached copy keeps being served).
    """
    global _runtime_meta, _meta_loaded_time
    if _runtime_meta is None and not _ensure_faiss_index():
        return []
    stale = (time.time() - _meta_loaded_time) > _META_TTL_SECONDS
    if stale:
        try:
            meta_file = os.path.join(RUNTIME_DIR, META_NAME)
            with open(meta_file, "r", encoding="utf-8") as fh:
                _runtime_meta = json.load(fh)
            _meta_loaded_time = time.time()
        except Exception:
            pass  # keep the cached copy if the reload fails
    return _runtime_meta or []
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ------------------------------------------------------------------
|
| 184 |
+
# 🔹 Core Search
|
| 185 |
+
# ------------------------------------------------------------------
|
| 186 |
+
def search_index(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic FAISS search and return metadata hits.

    The query is embedded, L2-normalized, and matched against the
    runtime index. Each hit is a copy of its metadata row with
    ``score``/``file``/``text`` fields filled in. Returns [] when the
    index is unavailable or the search fails.
    """
    if not _ensure_faiss_index():
        return []

    try:
        embedder = _ensure_model()
        query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(query_vec)
        scores, positions = _runtime_index.search(query_vec, top_k)

        hits: List[Dict[str, Any]] = []
        for score, pos in zip(scores[0], positions[0]):
            # FAISS pads missing results with -1; also guard stale indices
            if not (0 <= pos < len(_runtime_meta)):
                continue
            hit = dict(_runtime_meta[pos])
            hit["score"] = float(score)
            hit["file"] = hit.get("file") or hit.get("source") or "unknown"
            hit["text"] = hit.get("text") or hit.get("definition", "")
            hits.append(hit)
        return hits
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")
        return []
|
core/vector_sync.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_sync.py
|
| 3 |
+
Responsibilities:
|
| 4 |
+
- rebuild_faiss_from_glossary(glossary_path) -> builds a new faiss.Index + meta list
|
| 5 |
+
- _upload_to_dataset(index_path, meta_path, repo_id) -> upload via huggingface_hub
|
| 6 |
+
- safe helpers for creating normalized metadata entries
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import json
|
| 12 |
+
import shutil
|
| 13 |
+
from typing import Tuple, List, Dict, Any
|
| 14 |
+
|
| 15 |
+
import faiss
|
| 16 |
+
import numpy as np
|
| 17 |
+
from sentence_transformers import SentenceTransformer
|
| 18 |
+
from huggingface_hub import upload_file
|
| 19 |
+
|
| 20 |
+
# default embedder (same model used elsewhere)
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# directories (created eagerly at import time so callers can assume they exist)
PERSISTENT_DIR = "/home/user/app/persistent"  # where index/meta artifacts are kept
TMP_DIR = "/home/user/app/tmp"                # scratch space for builds
os.makedirs(PERSISTENT_DIR, exist_ok=True)
os.makedirs(TMP_DIR, exist_ok=True)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _ensure_model():
    """Return a process-wide SentenceTransformer, created once and reused.

    The previous implementation promised a "global" model in its
    docstring but constructed a fresh SentenceTransformer on every
    call; the instance is now cached on the function so the expensive
    model load happens at most once. Interface is unchanged.
    """
    model = getattr(_ensure_model, "_cached", None)
    if model is None:
        model = SentenceTransformer(EMBED_MODEL_NAME)
        _ensure_model._cached = model
    return model
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _normalize_meta_row(row: Dict[str, Any]) -> Dict[str, Any]:
|
| 36 |
+
"""Ensure consistent meta record fields."""
|
| 37 |
+
out = {
|
| 38 |
+
"term": row.get("term") or row.get("Term") or row.get("name") or "",
|
| 39 |
+
"text": row.get("text") or row.get("definition") or row.get("content") or "",
|
| 40 |
+
# keep both 'file' (local/basename) and full 'sources' list
|
| 41 |
+
"file": row.get("file") or row.get("source") or "",
|
| 42 |
+
"type": row.get("type") or "",
|
| 43 |
+
"sources": row.get("sources") if isinstance(row.get("sources"), list) else [row.get("source")] if row.get("source") else []
|
| 44 |
+
}
|
| 45 |
+
return out
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ==========================================================
|
| 49 |
+
# 🧠 Main Function: Rebuild FAISS from glossary.json
|
| 50 |
+
# ==========================================================
|
| 51 |
+
def rebuild_faiss_from_glossary(glossary_path: str):
    """
    Build FAISS index + metadata from a glossary JSON file.

    Handles mixed entries (PDF, Excel, Web, Other) and is resilient to
    malformed or oversized rows: bad rows are collected and reported
    instead of aborting the rebuild.

    Parameters
    ----------
    glossary_path : str
        Path to the glossary JSON — either a list of row dicts or a
        dict whose values are row dicts.

    Returns
    -------
    tuple
        ``(faiss.IndexFlatIP, list[dict])`` — inner-product index over
        L2-normalized embeddings and the metadata list aligned 1:1
        with the vectors.

    Raises
    ------
    FileNotFoundError
        If ``glossary_path`` does not exist.
    RuntimeError
        If the JSON cannot be parsed, or no valid entries remain.
    ValueError
        If the parsed JSON is neither a list nor a dict.
    """
    print(f"🧩 Building FAISS from glossary: {glossary_path}")
    if not os.path.exists(glossary_path):
        raise FileNotFoundError(f"Glossary not found: {glossary_path}")

    # --- Load JSON safely
    with open(glossary_path, "r", encoding="utf-8") as f:
        try:
            glossary_data = json.load(f)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load glossary JSON: {e}")

    # Normalize structure
    if isinstance(glossary_data, dict):
        glossary_items = list(glossary_data.values())
    elif isinstance(glossary_data, list):
        glossary_items = glossary_data
    else:
        raise ValueError("Invalid glossary format — must be list or dict.")

    entries, metas, bad_entries, long_entries = [], [], [], []

    # helper: normalized type inference
    def infer_type_from_source(src: str, declared_type: str = "") -> str:
        """Classify a row as pdf/excel/web/other from its source string."""
        src_l = (src or "").lower()
        declared = (declared_type or "").lower()
        if src_l.endswith(".pdf") or "pdf" in declared:
            return "pdf"
        if src_l.endswith((".xlsx", ".xls")) or "excel" in declared or "xls" in src_l:
            return "excel"
        if src_l.startswith("http") or declared == "web" or "http" in src_l:
            return "web"
        return "other"

    # --- Process glossary items
    for i, item in enumerate(glossary_items):
        try:
            if not isinstance(item, dict):
                bad_entries.append(item)
                continue

            term = str(item.get("term") or item.get("Term") or item.get("name") or "").strip()
            definition = str(item.get("definition") or item.get("text") or item.get("content") or "").strip()

            # Normalize sources (keep list)
            src_field = item.get("sources") or item.get("source") or item.get("file") or ""
            if isinstance(src_field, list):
                src_list = [str(s).strip() for s in src_field if s]
                src = ", ".join(src_list)
            else:
                src_list = [str(src_field).strip()] if src_field else []
                src = str(src_field).strip()

            declared_type = str(item.get("type") or "").strip().lower()
            entry_type = infer_type_from_source(src, declared_type)

            # Clean up noisy HTML tags and whitespace
            definition_clean = re.sub(r"<[^>]*>", "", definition)
            definition_clean = re.sub(r"\s+", " ", definition_clean).strip()

            # Skip if missing essentials
            if not term or not definition_clean:
                bad_entries.append(item)
                continue

            # Skip extremely long definitions (likely raw HTML or large web content)
            if len(definition_clean) > 3000:
                long_entries.append({
                    "term": term,
                    "len": len(definition_clean),
                    "source": src
                })
                continue

            text = f"Definition of {term}: {definition_clean}"

            entries.append(text)
            metas.append({
                "term": term,
                "definition": definition_clean,
                # preserve the original source list and file name
                "sources": src_list if src_list else [src] if src else [],
                "source": src,
                "type": entry_type,
                "file": os.path.basename(glossary_path)
            })

        except Exception as e:
            bad_entries.append({
                "index": i,
                "error": str(e),
                "raw": str(item)[:300]
            })
            continue

    # --- Diagnostics
    pdf_count = sum(1 for m in metas if m["type"].lower() == "pdf")
    excel_count = sum(1 for m in metas if m["type"].lower() == "excel")
    web_count = sum(1 for m in metas if m["type"].lower() == "web")
    other_count = len(metas) - (pdf_count + excel_count + web_count)

    print(f"🧠 Encoding {len(entries)} entries (PDF={pdf_count}, Excel={excel_count}, Web={web_count}, Other={other_count})…")

    if bad_entries:
        print(f"⚠️ {len(bad_entries)} malformed entries skipped.")
        for b in bad_entries[:3]:
            print(" →", json.dumps(b, ensure_ascii=False)[:300])

    if long_entries:
        print(f"⚠️ {len(long_entries)} very long entries (>3000 chars) skipped.")
        for l in long_entries[:3]:
            print(f" → Skipped {l['term']} ({l['len']} chars) from {l['source']}")

    if not entries:
        raise RuntimeError("❌ No valid glossary entries found after cleanup!")

    # --- Encoding
    # The embedder is instantiated only now, after the glossary has been
    # validated — previously it was created up front, so a missing or
    # malformed glossary still paid the (slow) model-load cost.
    model = SentenceTransformer(EMBED_MODEL_NAME)
    embeddings = model.encode(entries, show_progress_bar=True, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    print(f"✅ Glossary vectors built ({len(entries)} total entries).")

    # metas is list of dicts aligned with vectors — return exactly as before
    return index, metas
|
| 183 |
+
|
| 184 |
+
# ==========================================================
|
| 185 |
+
# ☁️ Upload Helper
|
| 186 |
+
# ==========================================================
|
| 187 |
+
def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str) -> None:
    """
    Upload FAISS index and metadata JSON to a Hugging Face dataset.

    Both files land under ``persistent/`` in the target repo. Any
    failure is logged and re-raised so callers can react.
    """
    try:
        print(f"☁️ Uploading {index_path} and {meta_path} to {repo_id}...")
        for local_path in (index_path, meta_path):
            upload_file(
                path_or_fileobj=local_path,
                path_in_repo=f"persistent/{os.path.basename(local_path)}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        print("✅ Upload complete.")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}")
        raise
|
core/web_loader.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests, re, json, time, os
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs.

    URLs are read from ``urls_file`` (blank lines and ``#`` comments
    ignored). Pages already in the JSON cache are reused unless
    ``force_refresh`` is set. Returns the full list of cached entries.
    """
    # Start from the on-disk cache unless a refresh was requested.
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as fh:
                cache = json.load(fh)
        except Exception:
            cache = {}

    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as fh:
        urls = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

    fetched = {}
    for position, url in enumerate(urls[: max_pages * 10]):
        if url in cache and not force_refresh:
            fetched[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({position+1}/{len(urls)}): {url}")
            response = requests.get(url, timeout=timeout, headers={"User-Agent": "ClinicalTrialChatBot/1.0"})
            if response.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {response.status_code}")
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # strip boilerplate/navigation before extracting text
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())
            if len(text) < 400:
                continue  # too little content to be useful
            entry_text = f"Source URL: {url}. {text[:3000]}"
            fetched[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)  # be polite to the target hosts
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    cache.update(fetched)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as fh:
        json.dump(cache, fh, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
|