# app.py — Updated: autocomplete + no-cache + keep existing functions
# Based on your uploaded app (app (3).py). Kept functions intact and only added UI + JS enhancements.
import gradio as gr
import time
import re
from pathlib import Path
from api import summarize_combined_wrapper
from fastapi import FastAPI
from fastapi.responses import JSONResponse
# -----------------------------------------
# TEMP: Dataset Path Debugger
# -----------------------------------------
import os, glob
DISCLAIMER_TEXT = (
"This app is intended for educational and informational purposes only. "
"It does not provide medical advice, diagnosis, or treatment. "
"Content is derived from publicly available, authoritative sources "
"including FDA, ICH, SCDM, CDISC, and similar organizations."
)
print("\n===== DATASET CHECK =====")
print("HOME DIR:", os.listdir("/home"))
print("USER DIR:", os.listdir("/home/user"))
print("HF CACHE:", glob.glob("/home/user/.cache/huggingface/datasets/*"))
print("HF SNAPSHOTS:", glob.glob("/home/user/.cache/huggingface/datasets/**", recursive=True))
print("==========================\n")
# -----------------------------
# Chat response streamer (unchanged)
# -----------------------------
def stream_chat_generator(question: str):
    """Yield the answer to *question* incrementally for a typing effect.

    Blank/empty input yields a single prompt message. Any failure in the
    backend call is surfaced to the UI as an "Error: ..." string instead
    of raising. Each yield is a growing prefix of the full answer, emitted
    80 characters at a time with a short pause between chunks.
    """
    if not (question and question.strip()):
        yield "Please enter a question."
        return

    try:
        result = summarize_combined_wrapper(question)
        answer = result.get("answer", "") if isinstance(result, dict) else str(result)
    except Exception as exc:  # deliberate best-effort: show the error in-chat
        answer = f"Error: {exc}"

    # Stream cumulative prefixes rather than individual chunks so the
    # frontend can simply replace the displayed text each time.
    step = 80
    sent = 0
    while sent < len(answer):
        sent += step
        yield answer[:sent]
        time.sleep(0.025)
# -----------------------------
# Load Glossary From File
# -----------------------------
# Glossary HTML is loaded once at startup; a placeholder is used when the
# asset has not been uploaded alongside the app.
GLOSSARY_FILE = Path("glossary.html")
if not GLOSSARY_FILE.exists():
    # Fix: the original assignment was an unterminated string literal split
    # across three lines — a SyntaxError. Restore a valid one-line placeholder
    # carrying the same message.
    GLOSSARY_HTML = "<p>(glossary.html not found — please upload)</p>"
else:
    GLOSSARY_HTML = GLOSSARY_FILE.read_text(encoding="utf-8")
# -----------------------------
# Build autocomplete terms list from glossary.html (dedupe + sort)
# -----------------------------
def extract_terms_from_glossary(html_text: str):
    """Heuristically pull autocomplete terms out of the glossary HTML.

    Strips tags, finds long comma-separated runs of text, splits them into
    tokens, trims a single leading "(" / trailing ")", then dedupes
    case-insensitively (keeping the first-seen capitalization) and returns
    the terms sorted alphabetically, case-insensitive.
    """
    # Drop HTML tags, then collapse all runs of whitespace to single spaces.
    plain = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", html_text))

    # Candidate segments: lines/clauses that look like comma-separated term
    # lists, or that are simply long.
    segments = [
        chunk
        for chunk in re.split(r"[;\n\r]", plain)
        if chunk.count(",") >= 3 or len(chunk.split()) > 20
    ]

    extracted = []
    for chunk in segments:
        for raw in chunk.split(","):
            # Trim stray parentheses at either end, then re-strip.
            token = re.sub(r'^\(|\)$', '', raw.strip()).strip()
            # Skip empties and single characters.
            if len(token) > 1:
                extracted.append(re.sub(r"\s+", " ", token))

    # Fallback: no candidate segments at all — split the whole text on commas.
    if not extracted:
        extracted = [part.strip() for part in plain.split(",") if len(part.strip()) > 1]

    # Case-insensitive dedupe preserving first-seen capitalization.
    unique = {}
    for token in extracted:
        unique.setdefault(token.lower(), token)
    return sorted(unique.values(), key=str.lower)
# Deduped, sorted term list extracted from the glossary at startup; feeds the
# browser-side autocomplete datalist.
AUTOCOMPLETE_TERMS = extract_terms_from_glossary(GLOSSARY_HTML)
# Build datalist options string (safe-escaped)
def build_options_html(terms):
opt_lines = []
for t in terms:
# escape double quotes in value attribute
v = t.replace('"', """)
opt_lines.append(f'