new
med.ipynb ADDED
@@ -0,0 +1,174 @@
import re, requests
from typing import List, Dict, Tuple
from bs4 import BeautifulSoup
import numpy as np
import faiss
import streamlit as st

from sentence_transformers import SentenceTransformer  # local Hugging Face model

MEDLINE_WSEARCH = "https://wsearch.nlm.nih.gov/ws/query"
DISCLAIMER = ("This assistant provides general health information and is not a substitute for professional medical advice, "
              "diagnosis, or treatment. For personal medical concerns, consult a qualified clinician or seek emergency care for urgent symptoms.")

# --- Red-flag patterns for basic triage ---
RED_FLAGS = [
    r"\b(chest pain|pressure in chest)\b",
    r"\b(trouble breathing|shortness of breath|severe breathlessness)\b",
    r"\b(signs of stroke|face droop|arm weakness|speech trouble|sudden confusion)\b",
    r"\b(severe allergic reaction|anaphylaxis|swelling of face|swelling of tongue)\b",
    r"\b(black stools|vomiting blood|severe bleeding)\b",
    r"\b(severe dehydration|no urination|sunken eyes)\b",
    r"\b(high fever|stiff neck|severe headache)\b",
]

def has_red_flags(text: str) -> bool:
    t = text.lower()
    return any(re.search(p, t) for p in RED_FLAGS)
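
# Quick sanity check of the triage helper (illustrative, not exhaustive):
#   has_red_flags("sudden chest pain and nausea")  -> True
#   has_red_flags("mild seasonal sniffles")        -> False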

# --- MedlinePlus search and fetch ---
def medline_search(term: str, retmax: int = 5, rettype: str = "brief") -> List[Dict[str, str]]:
    params = {"db": "healthTopics", "term": term, "retmax": str(retmax), "rettype": rettype}
    r = requests.get(MEDLINE_WSEARCH, params=params, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "xml")  # the wsearch service returns XML; the "xml" parser requires lxml
    results = []
    for doc in soup.find_all("document"):
        title = doc.find("content", {"name": "title"})
        url = doc.find("content", {"name": "url"})
        snippet = doc.find("content", {"name": "snippet"}) or doc.find("content", {"name": "full-summary"})
        if title and url:
            results.append({"title": title.text.strip(), "url": url.text.strip(),
                            "snippet": snippet.text.strip() if snippet else ""})
    return results
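
# Shape sketch of a typical hit (illustrative values, not captured output):
#   medline_search("migraine", retmax=1) ->
#     [{"title": "Migraine", "url": "https://medlineplus.gov/migraine.html", "snippet": "…"}]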

def fetch_page_text(url: str, max_chars: int = 12000) -> str:
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    for tag in soup(["script", "style", "nav", "footer", "header", "form", "aside"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = re.sub(r"\n{2,}", "\n", text)
    # Truncation happens before chunking, so each page contributes at most max_chars characters.
    return text[:max_chars].strip()

def chunk_text(text: str, approx_tokens: int = 220) -> List[str]:
    # "approx_tokens" is really a word count: the text is split on whitespace
    # and each chunk holds up to approx_tokens words.
    words = text.split()
    chunks = []
    for i in range(0, len(words), approx_tokens):
        chunk = " ".join(words[i:i + approx_tokens])
        if len(chunk) > 40:  # drop tiny trailing fragments
            chunks.append(chunk)
    return chunks
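
# Example (sketch): a 500-word input yields chunks of 220, 220, and 60 words.
#   len(chunk_text("word " * 500)) -> 3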

# --- Embeddings via Hugging Face ---
@st.cache_resource
def load_local_embedder():
    # Downloads the model from the Hugging Face Hub on first use, then runs locally.
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hf_inference_embed(texts: List[str], hf_token: str) -> np.ndarray:
    # Embeddings via the Hugging Face Inference API. The feature-extraction
    # pipeline route is used here: the plain /models/ route can serve this
    # model's default sentence-similarity pipeline, which expects a different payload.
    api_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
    headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
    # Batch once for simplicity; for large corpora, split into smaller requests.
    resp = requests.post(api_url, headers=headers, json={"inputs": texts}, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    # Defensive check: some responses carry an {'error': ...} payload.
    if isinstance(data, dict) and "error" in data:
        raise RuntimeError(data["error"])
    arr = np.array(data, dtype=np.float32)
    if arr.ndim == 3:  # token-level vectors from some backends: mean-pool to sentence level
        arr = arr.mean(axis=1)
    # L2-normalize so inner product equals cosine similarity.
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
    return arr / norms
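
# Batched variant (a sketch, assuming the same request shape) for larger corpora:
def hf_inference_embed_batched(texts: List[str], hf_token: str, batch_size: int = 64) -> np.ndarray:
    parts = [hf_inference_embed(texts[i:i + batch_size], hf_token)
             for i in range(0, len(texts), batch_size)]
    return np.vstack(parts)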

def build_faiss(embeddings: np.ndarray) -> faiss.IndexFlatIP:
    # Inner-product index; with L2-normalized vectors this is cosine similarity.
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings.astype(np.float32))
    return index

def search_index(index: faiss.IndexFlatIP, query_emb: np.ndarray, k: int = 6) -> Tuple[np.ndarray, np.ndarray]:
    D, I = index.search(query_emb.astype(np.float32), k)
    return D, I
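
# Round-trip sketch with random normalized vectors (no model or network needed):
#   rng = np.random.default_rng(0)
#   emb = rng.standard_normal((10, 384)).astype(np.float32)   # 384 = MiniLM-L6-v2 dim
#   emb /= np.linalg.norm(emb, axis=1, keepdims=True)
#   D, I = search_index(build_faiss(emb), emb[:1], k=3)       # I[0][0] == 0 (self-match)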

def format_answer(query: str, hits: List[int], docs: List[Dict[str, str]], urgent: bool) -> str:
    # Group retrieved chunks by their source page so each topic is cited once.
    grouped = {}
    for idx in hits:
        d = docs[idx]
        key = (d["source_title"], d["source_url"])
        grouped.setdefault(key, []).append(d["content"])
    lines = []
    if urgent:
        lines.append("Potential urgent symptoms detected. Consider seeking immediate care before self-care steps.")
    lines.append("What it is:\n- Below are excerpts from MedlinePlus topics related to the question.")
    lines.append("Common symptoms:\n- See excerpts; symptom overlap is common, so confirm with a clinician.")
    lines.append("Self-care steps:\n- Follow patient-friendly guidance in the excerpts when appropriate.")
    lines.append("When to seek care:\n- New, severe, or worsening symptoms, or red flags such as chest pain, trouble breathing, stroke signs, or severe allergic reaction.")
    lines.append("Sources:")
    for (title, url), chunks in grouped.items():
        lines.append(f"- {title} — {url}")
        for c in chunks[:2]:  # cap excerpts per source to keep the answer short
            snippet = (c[:360] + "…") if len(c) > 360 else c
            lines.append(f"  • {snippet}")
    lines.append(DISCLAIMER)
    return "\n\n".join(lines)

st.set_page_config(page_title="MedAssist (HF MiniLM + MedlinePlus)", page_icon="🩺")
st.title("MedAssist: Hugging Face MiniLM + MedlinePlus")
st.info(DISCLAIMER)

with st.sidebar:
    st.header("Retriever settings")
    use_hf_api = st.checkbox("Use Hugging Face Inference API (else local)", value=False)
    hf_token = st.text_input("HF API Token (if API mode)", type="password")
    topk_urls = st.slider("MedlinePlus URLs to fetch", 1, 8, 4)
    chunks_per_url = st.slider("Chunks per URL", 2, 12, 6)
    topk = st.slider("Top chunks to return", 2, 12, 6)
    st.caption("MedlinePlus wsearch → fetch pages → MiniLM embeddings → FAISS semantic search")

query = st.text_input("Describe symptoms or enter a medical term")
if st.button("Search"):
    urgent = has_red_flags(query)
    try:
        topics = medline_search(query, retmax=topk_urls, rettype="brief")
    except Exception as e:
        st.error(f"MedlinePlus search failed: {e}")
        topics = []

    docs = []
    for t in topics:
        try:
            text = fetch_page_text(t["url"])
            chunks = chunk_text(text)[:chunks_per_url]
            for ch in chunks:
                docs.append({"source_title": t["title"], "source_url": t["url"], "content": ch})
        except Exception:
            continue  # skip pages that fail to download or parse

    if not docs:
        st.warning("No relevant MedlinePlus content found. Try a different term or consult a clinician.")
    else:
        texts = [d["content"] for d in docs]
        # Validate the token outside the try block: st.stop() raises an exception
        # that the broad `except Exception` below would otherwise swallow.
        if use_hf_api and not hf_token:
            st.error("Provide a Hugging Face API token to use the Inference API.")
            st.stop()
        try:
            if use_hf_api:
                doc_emb = hf_inference_embed(texts, hf_token)
                q_emb = hf_inference_embed([query], hf_token)
            else:
                model = load_local_embedder()  # downloads from the Hugging Face Hub on first run
                doc_emb = model.encode(texts, normalize_embeddings=True, batch_size=32, show_progress_bar=False)
                q_emb = model.encode([query], normalize_embeddings=True)
        except Exception as e:
            st.error(f"Embedding failed: {e}")
            st.stop()

        index = build_faiss(np.array(doc_emb, dtype=np.float32))
        D, I = search_index(index, np.array(q_emb, dtype=np.float32), k=topk)
        hit_ids = [int(i) for i in I[0] if i >= 0]
        answer = format_answer(query, hit_ids, docs, urgent)
        st.markdown(answer)
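
# To run (assumption: code saved as a plain script, e.g. med.py, rather than a notebook):
#   pip install streamlit sentence-transformers faiss-cpu beautifulsoup4 lxml requests numpy
#   streamlit run med.py
# lxml is needed because medline_search parses the wsearch response with BeautifulSoup's "xml" parser.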