essprasad commited on
Commit
12db3c3
·
verified ·
1 Parent(s): 43311b4

Upload 3 files

Browse files
Files changed (3) hide show
  1. api.py +356 -0
  2. app.py +102 -0
  3. postBuild +60 -0
api.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import os
import json
import traceback
import shutil
import typing

# ============================================================
# Try core pipeline first
# ============================================================
# Prefer the full hybrid-retriever pipeline; if it cannot be imported
# (missing package, broken environment) the API falls back to the local
# glossary + TF-IDF/FAISS path implemented below.
try:
    from core.hybrid_retriever import summarize_combined as core_summarize_combined
    CORE_AVAILABLE = True
except Exception:
    core_summarize_combined = None
    CORE_AVAILABLE = False

# ------------------------------------------------------------
# Admin functions (safe fallback)
# ------------------------------------------------------------
# Stub implementations keep the admin endpoints responsive even when
# core.admin_tasks is not importable.
try:
    from core.admin_tasks import rebuild_index, rebuild_glossary, reset_faiss_cache, clear_index
except Exception:
    # fallbacks
    def rebuild_index(): return "rebuild_index not available"
    def rebuild_glossary(): return "rebuild_glossary not available"
    def reset_faiss_cache(): return "reset_faiss_cache not available"
    def clear_index(): return "clear_index not available"

# ------------------------------------------------------------
# Optional FAISS + SentenceTransformer
# ------------------------------------------------------------
# The embedder is loaded at import time on purpose: any failure flips
# FAISS_OK so retrieval silently degrades to the TF-IDF fallback.
try:
    import faiss
    from sentence_transformers import SentenceTransformer
    EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
    FAISS_OK = True
except Exception:
    EMBEDDER = None
    FAISS_OK = False

# TF-IDF fallback
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
# All locations and credentials are environment-overridable; the defaults
# suit a local/dev checkout.
GLOSSARY_PATH = os.environ.get("GLOSSARY_PATH", "./data/glossary.json")
FAISS_INDEX_DIR = os.environ.get("FAISS_INDEX_DIR", "./data/faiss_index")
DOCS_FOLDER = os.environ.get("DOCS_FOLDER", "./data/docs")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")  # NOTE(review): weak default — must be overridden in deployment

# Refuse index rebuilds above this used-disk level (in GB) unless forced.
DISK_USAGE_THRESHOLD_GB = float(os.environ.get("DISK_USAGE_THRESHOLD_GB", "45.0"))
57
+
58
+ # ============================================================
59
+ # Disk Utilities
60
+ # ============================================================
61
def get_folder_size_bytes(path: str) -> int:
    """Return the total size in bytes of all regular files under *path*.

    Returns 0 for a missing path. Unreadable or vanished files are
    skipped rather than raising (walk errors are also suppressed).
    """
    total = 0
    if not os.path.exists(path):
        return 0
    for root, _dirs, files in os.walk(path, onerror=lambda e: None):
        for name in files:
            fp = os.path.join(root, name)
            try:
                # EAFP: stat directly instead of the racy exists()+getsize()
                # pair the original used; a file deleted mid-walk just skips.
                total += os.path.getsize(fp)
            except OSError:
                pass
    return total
74
+
75
def bytes_to_human(n: int) -> str:
    """Format a byte count as a short human-readable string, e.g. '1.5KB'."""
    value = n
    for suffix in ("B", "KB", "MB", "GB", "TB"):
        if value < 1024:
            return f"{value:.1f}{suffix}"
        value /= 1024
    # Anything that survived five divisions is petabyte-scale.
    return f"{value:.1f}PB"
81
+
82
def get_disk_usage(path="/"):
    """Return {"total", "used", "free"} byte counts for the filesystem at *path*.

    Tries shutil.disk_usage first, then the POSIX-only os.statvfs, and
    finally returns all zeros — callers never see an exception.
    """
    try:
        usage = shutil.disk_usage(path)
        return {"total": usage.total, "used": usage.used, "free": usage.free}
    except OSError:
        try:
            # statvfs does not exist on Windows, hence AttributeError too.
            st = os.statvfs(path)
            total = st.f_frsize * st.f_blocks
            free = st.f_frsize * st.f_bfree
            used = total - free
            return {"total": total, "used": used, "free": free}
        except (OSError, AttributeError):
            return {"total": 0, "used": 0, "free": 0}
95
+
96
+ # ============================================================
97
+ # Glossary / Docs
98
+ # ============================================================
99
def load_glossary():
    """Load the glossary JSON from GLOSSARY_PATH.

    Returns {} when the file is missing, unreadable, or not valid JSON,
    so callers can always iterate the result.
    """
    try:
        # EAFP: open directly instead of the racy exists()+open() pair.
        with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}
107
+
108
def load_docs():
    """Read every regular file in DOCS_FOLDER as a UTF-8 text document.

    Returns a list of {"id": filename, "text": contents} dicts.
    Unreadable or non-text files are skipped.
    """
    docs = []
    if not os.path.exists(DOCS_FOLDER):
        return docs
    for name in os.listdir(DOCS_FOLDER):
        full = os.path.join(DOCS_FOLDER, name)
        if os.path.isfile(full):
            try:
                # `with` closes the handle deterministically; the original
                # `open(...).read()` leaked file descriptors.
                with open(full, "r", encoding="utf-8") as fh:
                    docs.append({"id": name, "text": fh.read()})
            except (OSError, UnicodeDecodeError):
                pass
    return docs
120
+
121
+ # ============================================================
122
+ # TF-IDF Retriever
123
+ # ============================================================
124
class SimpleRetriever:
    """TF-IDF document retriever used when the FAISS index is unavailable."""

    def __init__(self, docs):
        self.docs = docs
        corpus = [doc["text"] for doc in docs]
        if not corpus:
            # No documents at all: mark the retriever inert.
            self.vectorizer = None
            return
        self.vectorizer = TfidfVectorizer(stop_words="english", max_features=4000)
        self.mat = self.vectorizer.fit_transform(corpus)

    def query(self, q, k=3):
        """Return up to *k* positively-scored excerpts most similar to *q*."""
        if not self.vectorizer:
            return []
        scores = linear_kernel(self.vectorizer.transform([q]), self.mat).flatten()
        results = []
        for idx in scores.argsort()[::-1][:k]:
            if scores[idx] > 0:
                doc = self.docs[idx]
                results.append({
                    "id": doc["id"],
                    "excerpt": doc["text"][:300].replace("\n", " "),
                    "score": float(scores[idx]),
                })
        return results
146
+
147
+ # ============================================================
148
+ # FAISS Searcher
149
+ # ============================================================
150
def load_faiss():
    """Return a `search(q, k=3)` closure over the persisted FAISS index.

    Returns None when FAISS/the embedder is unavailable, when the index
    files are missing, or when they cannot be read — letting callers
    fall back to the TF-IDF retriever.
    """
    if not FAISS_OK:
        return None
    idx_file = os.path.join(FAISS_INDEX_DIR, "index.faiss")
    map_file = os.path.join(FAISS_INDEX_DIR, "mapping.json")
    if not os.path.exists(idx_file) or not os.path.exists(map_file):
        return None
    try:
        idx = faiss.read_index(idx_file)
        # `with` closes the mapping file; the original json.load(open(...))
        # leaked the handle.
        with open(map_file, "r", encoding="utf-8") as fh:
            mapping = json.load(fh)

        def search(q, k=3):
            emb = EMBEDDER.encode([q])
            D, I = idx.search(emb, k)
            res = []
            for score, i_id in zip(D[0], I[0]):
                # Mapping keys are stringified int ids written at build time.
                meta = mapping.get(str(int(i_id)), {})
                txt = (meta.get("text", "")[:300]).replace("\n", " ")
                res.append({
                    "id": meta.get("id", i_id),
                    "excerpt": txt,
                    "score": float(score),
                })
            return res

        return search
    except Exception:
        # Corrupt index/mapping: behave as "no FAISS" rather than crash.
        return None
177
+
178
+ # ============================================================
179
+ # Summarize Wrapper
180
+ # ============================================================
181
def fallback_summarize(question):
    """Build an answer from glossary matches plus top retrieved documents.

    Used when the core pipeline is unavailable. Returns a dict with
    "answer" (str) and "citations" (list of hit dicts).
    """
    glossary = load_glossary()
    docs = load_docs()

    # Glossary terms whose name appears verbatim (case-insensitively)
    # in the question.
    q_lower = question.lower()
    g_hits = [
        {"source": f"glossary:{term}", "excerpt": definition[:300]}
        for term, definition in glossary.items()
        if term.lower() in q_lower
    ]

    # Prefer the FAISS searcher when one loads; otherwise TF-IDF over docs.
    faiss_srch = load_faiss()
    doc_hits = faiss_srch(question) if faiss_srch else SimpleRetriever(docs).query(question)

    parts = []
    if g_hits:
        glossary_lines = "\n".join(f"- {h['source']}: {h['excerpt']}" for h in g_hits)
        parts.append("Glossary matches:\n" + glossary_lines)
    if doc_hits:
        doc_lines = "\n".join(f"- ({d['id']}) {d['excerpt']}" for d in doc_hits)
        parts.append("Top documents:\n" + doc_lines)

    if not parts:
        return {"answer": f"No sources found for: {question}", "citations": []}

    return {"answer": "\n\n".join(parts), "citations": g_hits + doc_hits}
206
+
207
def summarize_combined_wrapper(q):
    """Answer *q* via the core pipeline when importable, else the fallback.

    Always returns {"answer": str, "citations": list}. Core-pipeline
    failures are logged and silently routed to the fallback, never raised.
    """
    if CORE_AVAILABLE and core_summarize_combined:
        try:
            res = core_summarize_combined(q)
            if isinstance(res, dict):
                return {"answer": res.get("answer", ""), "citations": res.get("citations", [])}
            return {"answer": str(res), "citations": []}
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); log, then fall back.
            traceback.print_exc()
            return fallback_summarize(q)
    return fallback_summarize(q)
218
+
219
+ # ============================================================
220
+ # FastAPI - Inner App (CT-Chat API)
221
+ # ============================================================
222
# Inner API application; mounted under /api on the root app further below.
app = FastAPI(title="CT-Chat API", description="API endpoint for Clinical Trial Chatbot")

# NOTE(review): per the CORS spec, browsers reject `allow_origins=["*"]`
# combined with `allow_credentials=True`; confirm whether credentialed
# requests are actually needed, otherwise drop allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True,
    allow_methods=["*"], allow_headers=["*"]
)
229
+
230
class Query(BaseModel):
    """Request body for POST /chat: the user's question text."""
    question: str
232
+
233
class AdminPayload(BaseModel):
    """Request body for the admin endpoints.

    `password` is compared against ADMIN_PASS; `force` lets
    /admin/safe_rebuild_index bypass the disk-usage safety check.
    """
    password: str
    force: typing.Optional[bool] = False
236
+
237
+ # ---------------- Chat Endpoint ----------------
238
# ---------------- Chat Endpoint ----------------
@app.post("/chat")
async def chat(q: Query):
    """Answer a chatbot question; failures are reported in-band, never raised."""
    try:
        result = summarize_combined_wrapper(q.question)
    except Exception as exc:
        return {"answer": str(exc), "citations": [], "status": "error"}
    return {
        "answer": result["answer"],
        "citations": result.get("citations", []),
        "status": "success",
    }
245
+
246
+ # ============================================================
247
+ # Disk Usage
248
+ # ============================================================
249
@app.get("/admin/disk_usage")
def api_disk_usage():
    """Report overall disk usage plus the FAISS index folder size (human units)."""
    usage = get_disk_usage("/")
    index_bytes = get_folder_size_bytes(FAISS_INDEX_DIR)
    response = {
        "disk_total_human": bytes_to_human(usage["total"]),
        "disk_used_human": bytes_to_human(usage["used"]),
        "disk_free_human": bytes_to_human(usage["free"]),
        "faiss_index_size": bytes_to_human(index_bytes),
        "faiss_index_dir": FAISS_INDEX_DIR,
        "threshold_gb": DISK_USAGE_THRESHOLD_GB,
    }
    return response
261
+
262
+ # ============================================================
263
+ # Safe Rebuild Index
264
+ # ============================================================
265
def _check(p: AdminPayload):
    """Raise HTTP 401 unless the payload carries the configured admin password."""
    if p.password != ADMIN_PASS:
        raise HTTPException(status_code=401, detail="Unauthorized")
268
+
269
@app.post("/admin/safe_rebuild_index")
def admin_safe_rebuild(p: AdminPayload):
    """Clear the FAISS index folder and rebuild it, guarded by a disk-usage cap.

    Refuses to run while used disk space exceeds DISK_USAGE_THRESHOLD_GB
    unless the payload sets force=true. Returns a {"status", ...} dict in
    all cases; errors are reported in-band rather than raised.
    """
    _check(p)
    usage = get_disk_usage("/")
    used_gb = usage["used"] / (1024 ** 3)

    # Safety valve: a rebuild temporarily doubles index disk usage.
    if used_gb >= DISK_USAGE_THRESHOLD_GB and not p.force:
        return {
            "status": "error",
            "reason": f"Disk usage {used_gb:.2f}GB is above safety threshold {DISK_USAGE_THRESHOLD_GB}GB. Use force:true to override."
        }

    try:
        if os.path.exists(FAISS_INDEX_DIR):
            # Delete the folder's contents entry by entry so one stubborn
            # file doesn't abort the whole cleanup.
            for f in os.listdir(FAISS_INDEX_DIR):
                fp = os.path.join(FAISS_INDEX_DIR, f)
                try:
                    if os.path.isdir(fp):
                        shutil.rmtree(fp)
                    else:
                        os.remove(fp)
                except Exception as e:
                    print(f"Warning: could not delete {fp}: {e}")
        else:
            # Folder missing entirely: create it so the rebuild has a target.
            os.makedirs(FAISS_INDEX_DIR, exist_ok=True)
    except Exception as e:
        return {"status": "error", "reason": f"Failed to clear FAISS index folder: {e}"}

    try:
        res = rebuild_index()
        return {"status": "ok", "result": res}
    except Exception as e:
        traceback.print_exc()
        return {"status": "error", "reason": str(e)}
303
+
304
+ # ============================================================
305
+ # Password Validation
306
+ # ============================================================
307
@app.post("/admin/validate_password")
def api_validate_password(p: AdminPayload):
    """Tell the UI whether the supplied admin password is correct (no 401 here)."""
    return {"valid": p.password == ADMIN_PASS}
313
+
314
+ # ============================================================
315
+ # Existing Admin Endpoints
316
+ # ============================================================
317
@app.post("/admin/rebuild_index")
def api_rebuild_index(p: AdminPayload):
    """Rebuild the search index (admin password required)."""
    _check(p)
    result = rebuild_index()
    return {"status": "ok", "result": result}
321
+
322
@app.post("/admin/rebuild_glossary")
def api_rebuild_glossary(p: AdminPayload):
    """Rebuild the glossary (admin password required)."""
    _check(p)
    result = rebuild_glossary()
    return {"status": "ok", "result": result}
326
+
327
@app.post("/admin/reset_faiss")
def api_reset_faiss(p: AdminPayload):
    """Reset the FAISS cache (admin password required)."""
    _check(p)
    result = reset_faiss_cache()
    return {"status": "ok", "result": result}
331
+
332
@app.post("/admin/clear_index")
def api_clear_index(p: AdminPayload):
    """Clear the search index (admin password required).

    Failures are reported in-band with status "error" rather than raised.
    """
    _check(p)
    try:
        return {"status": "ok", "result": clear_index()}
    except Exception as e:
        # Bug fix: a failure previously still reported status "ok"; report
        # "error" for consistency with the other admin endpoints.
        return {"status": "error", "result": str(e)}
339
+
340
+ # ============================================================
341
+ # ✔✔ MOUNT API UNDER /api (Fix Android 404)
342
+ # ============================================================
343
# Wrap the API in a root application and expose it under /api so mobile
# clients that call /api/... stop getting 404s.
from fastapi import FastAPI as _FastAPI

root_app = _FastAPI(title="Root Server", description="API root router")
root_app.mount("/api", app)

# root_app is now the server entry point: rebinding `app` means the
# uvicorn target string "api:app" resolves to the mounted root app.
app = root_app
350
+
351
+ # ============================================================
352
+ # Local Run (now serves root_app correctly)
353
+ # ============================================================
354
if __name__ == "__main__":
    import uvicorn
    # "api:app" resolves to the module-level `app`, which by this point has
    # been rebound to root_app (the /api-mounted wrapper).
    uvicorn.run("api:app", host="0.0.0.0", port=7861)
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — Gradio-Only Chat UI (No FastAPI)
2
+
3
+ import gradio as gr
4
+ import time
5
+ import json
6
+
7
+ # Import your main chatbot pipeline from api.py
8
+ from api import summarize_combined_wrapper # THIS is your real chatbot function
9
+
10
+ # -----------------------------
11
+ # CHAT FUNCTION (direct call)
12
+ # -----------------------------
13
def stream_chat_generator(question: str):
    """Yield the chatbot answer progressively, roughly 80 characters at a time.

    Empty/whitespace questions yield a single HTML hint; pipeline errors
    are rendered into the answer text instead of raising.
    """
    if not question or not question.strip():
        yield "<i>Please enter a question.</i>"
        return

    try:
        result = summarize_combined_wrapper(question)
        full = result.get("answer", "") if isinstance(result, dict) else str(result)
    except Exception as exc:
        full = f"Error: {exc}"

    # Emit growing prefixes with a tiny delay to simulate streaming.
    chunk_size = 80
    position = 0
    while position < len(full):
        position += chunk_size
        yield full[:position]
        time.sleep(0.02)
29
+
30
+
31
# -----------------------------
# GRADIO UI (Mobile Friendly)
# -----------------------------
# Declarative Gradio layout: CT header, single question box, submit button,
# and an HTML output that streams answers from stream_chat_generator.
with gr.Blocks(
    css="""
    @media (max-width: 600px) {
        .gradio-container { padding: 8px !important; }
        input, textarea { font-size: 16px !important; }
        button { font-size: 17px !important; }
    }
    .header {
        display: flex;
        align-items: center;
        gap: 12px;
        margin-bottom: 15px;
    }
    .logo {
        width: 50px;
        height: 50px;
        background: #0ea5a4;
        border-radius: 8px;
        color: white;
        font-size: 24px;
        font-weight: bold;
        display: flex;
        justify-content: center;
        align-items: center;
    }
    .title-text {
        font-size: 22px;
        font-weight: 600;
        margin: 0;
    }
    """,
    title="Clinical Research Dictionary"
) as demo:

    # Header
    gr.HTML("""
    <div class='header'>
        <div class='logo'>CT</div>
        <div>
            <div class='title-text'>Clinical Research Dictionary</div>
            <div style='font-size:14px; color:gray'>
                Ask GCP, eCRF, LIMS, and clinical trial questions.
            </div>
        </div>
    </div>
    """)

    with gr.Column():
        q = gr.Textbox(
            label="Your Question",
            placeholder="e.g. What is an eCRF?",
            lines=1
        )
        submit = gr.Button("Submit", variant="primary")
        out = gr.HTML()

        # Generator handler: Gradio re-renders `out` on every yield.
        submit.click(stream_chat_generator, inputs=q, outputs=out)

        gr.Markdown("### Answers (one per source)")


# Launch UI
# Binds all interfaces on the Space's expected Gradio port; no public
# share link, no API tab, and no notebook inlining.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    inline=False,
    show_api=False
)
postBuild ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# PostBuild for the CT-Chat Space: pin websockets/gradio-client, preload the
# NLTK data into a shared directory, verify it, and trim caches to stay
# under the Space disk quota.
set -e

echo "🔧 PostBuild starting — optimizing CT-Chat Space..."

# -------------------------------------------------------
# 1. Fix dependency mismatches (Gradio & Websockets)
# -------------------------------------------------------
pip install --force-reinstall --no-cache-dir "websockets>=12" "gradio-client>=1.3.0"

# -------------------------------------------------------
# 2. Create and register shared NLTK data directory
# -------------------------------------------------------
echo "📁 Preparing shared NLTK data directory..."
export NLTK_DATA="/usr/local/share/nltk_data"
# Quote expansions so the script survives paths with spaces / IFS surprises.
mkdir -p "$NLTK_DATA"
chmod -R 777 "$NLTK_DATA"

# -------------------------------------------------------
# 3. Preload all required NLTK resources (including punkt_tab)
# -------------------------------------------------------
echo "📦 Downloading NLTK resources..."
python -m nltk.downloader -d "$NLTK_DATA" \
    punkt punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng stopwords wordnet omw-1.4

# -------------------------------------------------------
# 4. Verify NLTK installs and paths
# -------------------------------------------------------
# Bug fix: nltk.data.find() requires the category prefix (tokenizers/,
# taggers/, corpora/); bare names like "punkt" were always reported missing
# even after a successful download.
python - <<'PYCODE'
import nltk

print(f"NLTK data path → {nltk.data.path}")
resources = [
    "tokenizers/punkt",
    "tokenizers/punkt_tab",
    "taggers/averaged_perceptron_tagger_eng",
    "corpora/stopwords",
    "corpora/wordnet",
]
for pkg in resources:
    try:
        nltk.data.find(pkg)
        print(f"✅ Verified NLTK resource: {pkg}")
    except LookupError:
        print(f"⚠️ Missing NLTK resource: {pkg}")
PYCODE

# -------------------------------------------------------
# 5. Clean caches (stay <50GB)
# -------------------------------------------------------
echo "🧹 Cleaning Hugging Face + Torch caches..."
rm -rf /root/.cache/* || true
rm -rf /home/user/.cache/* || true
rm -rf /usr/local/share/nltk_data/taggers/__pycache__ || true
rm -rf /home/user/app/hf_cache/* || true
rm -rf /home/user/app/logs/* || true

# -------------------------------------------------------
# 6. Ensure writable temporary cache for runtime
# -------------------------------------------------------
echo "📦 Preparing /tmp/hf_cache..."
mkdir -p /tmp/hf_cache
chmod -R 777 /tmp/hf_cache

# -------------------------------------------------------
# Done
# -------------------------------------------------------
echo "✅ PostBuild completed successfully — NLTK preloaded (punkt_tab OK), cache ready at /tmp/hf_cache."