# app.py — Updated: autocomplete + no-cache + keep existing functions
# Based on your uploaded app (app (3).py). Kept functions intact and only added UI + JS enhancements.
import gradio as gr
import time
import re
from pathlib import Path
from api import summarize_combined_wrapper
from fastapi import FastAPI
from fastapi.responses import JSONResponse
# -----------------------------------------
# TEMP: Dataset Path Debugger
# -----------------------------------------
import os, glob
DISCLAIMER_TEXT = (
"This app is intended for educational and informational purposes only. "
"It does not provide medical advice, diagnosis, or treatment. "
"Content is derived from publicly available, authoritative sources "
"including FDA, ICH, SCDM, CDISC, and similar organizations."
)
print("\n===== DATASET CHECK =====")
print("HOME DIR:", os.listdir("/home"))
print("USER DIR:", os.listdir("/home/user"))
print("HF CACHE:", glob.glob("/home/user/.cache/huggingface/datasets/*"))
print("HF SNAPSHOTS:", glob.glob("/home/user/.cache/huggingface/datasets/**", recursive=True))
print("==========================\n")
# -----------------------------
# Chat response streamer (unchanged)
# -----------------------------
def stream_chat_generator(question: str):
    """Yield the answer to *question* incrementally for a typing effect.

    Blank/empty input yields a single prompt message. Any failure in the
    backend call is surfaced to the UI as an "Error: ..." string instead
    of raising. Each yield is a growing prefix of the full answer, emitted
    80 characters at a time with a short pause between chunks.
    """
    if not (question and question.strip()):
        yield "Please enter a question."
        return

    try:
        result = summarize_combined_wrapper(question)
        answer = result.get("answer", "") if isinstance(result, dict) else str(result)
    except Exception as exc:  # deliberate best-effort: show the error in-chat
        answer = f"Error: {exc}"

    # Stream cumulative prefixes rather than individual chunks so the
    # frontend can simply replace the displayed text each time.
    step = 80
    sent = 0
    while sent < len(answer):
        sent += step
        yield answer[:sent]
        time.sleep(0.025)
# -----------------------------
# Load Glossary From File
# -----------------------------
# Glossary HTML is loaded once at startup; a placeholder is used when the
# asset has not been uploaded alongside the app.
GLOSSARY_FILE = Path("glossary.html")
if not GLOSSARY_FILE.exists():
    # Fix: the original assignment was an unterminated string literal split
    # across three lines — a SyntaxError. Restore a valid one-line placeholder
    # carrying the same message.
    GLOSSARY_HTML = "<p>(glossary.html not found — please upload)</p>"
else:
    GLOSSARY_HTML = GLOSSARY_FILE.read_text(encoding="utf-8")
# -----------------------------
# Build autocomplete terms list from glossary.html (dedupe + sort)
# -----------------------------
def extract_terms_from_glossary(html_text: str):
    """Heuristically pull autocomplete terms out of the glossary HTML.

    Strips tags, finds long comma-separated runs of text, splits them into
    tokens, trims a single leading "(" / trailing ")", then dedupes
    case-insensitively (keeping the first-seen capitalization) and returns
    the terms sorted alphabetically, case-insensitive.
    """
    # Drop HTML tags, then collapse all runs of whitespace to single spaces.
    plain = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", html_text))

    # Candidate segments: lines/clauses that look like comma-separated term
    # lists, or that are simply long.
    segments = [
        chunk
        for chunk in re.split(r"[;\n\r]", plain)
        if chunk.count(",") >= 3 or len(chunk.split()) > 20
    ]

    extracted = []
    for chunk in segments:
        for raw in chunk.split(","):
            # Trim stray parentheses at either end, then re-strip.
            token = re.sub(r'^\(|\)$', '', raw.strip()).strip()
            # Skip empties and single characters.
            if len(token) > 1:
                extracted.append(re.sub(r"\s+", " ", token))

    # Fallback: no candidate segments at all — split the whole text on commas.
    if not extracted:
        extracted = [part.strip() for part in plain.split(",") if len(part.strip()) > 1]

    # Case-insensitive dedupe preserving first-seen capitalization.
    unique = {}
    for token in extracted:
        unique.setdefault(token.lower(), token)
    return sorted(unique.values(), key=str.lower)
# Deduped, sorted term list extracted from the glossary at startup; feeds the
# browser-side autocomplete datalist.
AUTOCOMPLETE_TERMS = extract_terms_from_glossary(GLOSSARY_HTML)
# Build datalist options string (safe-escaped)
def build_options_html(terms):
opt_lines = []
for t in terms:
# escape double quotes in value attribute
v = t.replace('"', """)
opt_lines.append(f'