# app.py — Healthcare RAG Assistant (Streamlit)
# Source: Hugging Face Space by HassanDataSci — "Update app.py", commit 73dc160 (verified)
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline
# βœ… Verified healthcare sources, grouped by publisher for easier maintenance.
_CDC_URLS = [
    "https://www.cdc.gov/asthma/faqs.htm",
    "https://www.cdc.gov/bloodpressure/index.htm",
    "https://www.cdc.gov/diabetes/basics/index.html",
    "https://www.cdc.gov/obesity/data/adult.html",
    "https://www.cdc.gov/nutrition/data-statistics/index.html",
    "https://www.cdc.gov/flu/about/index.html",
    "https://www.cdc.gov/hepatitis/hbv/index.htm",
    "https://www.cdc.gov/stroke/about.htm",
    "https://www.cdc.gov/arthritis/basics/index.html",
    "https://www.cdc.gov/kidneydisease/publications-resources/index.html",
    "https://www.cdc.gov/vaccines/vpd/vaccines-list.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/your-health/about-covid-19/basics-covid-19.html",
    "https://www.cdc.gov/rsv/about/symptoms.html",
    "https://www.cdc.gov/tb/topic/basics/default.htm",
]
_WHO_URLS = [
    "https://www.who.int/news-room/fact-sheets/detail/mental-disorders",
    "https://www.who.int/news-room/fact-sheets/detail/influenza-(seasonal)",
    "https://www.who.int/news-room/questions-and-answers/item/vaccines-and-immunization-what-is-vaccination",
]
_MAYO_CLINIC_URLS = [
    "https://www.mayoclinic.org/diseases-conditions/asthma/symptoms-causes/syc-20369653",
    "https://www.mayoclinic.org/diseases-conditions/depression/symptoms-causes/syc-20356007",
    "https://www.mayoclinic.org/diseases-conditions/high-blood-pressure/symptoms-causes/syc-20373410",
    "https://www.mayoclinic.org/diseases-conditions/type-2-diabetes/symptoms-causes/syc-20351193",
    "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/nutrition/art-20049340",
]
_MEDLINEPLUS_URLS = [
    "https://medlineplus.gov/asthma.html",
    "https://medlineplus.gov/depression.html",
    "https://medlineplus.gov/diabetes.html",
    "https://medlineplus.gov/influenza.html",
    "https://medlineplus.gov/vaccines.html",
    "https://medlineplus.gov/coronaryarterydisease.html",
]
_NIH_URLS = [
    "https://www.nih.gov/about-nih/what-we-do/nih-almanac/national-heart-lung-blood-institute-nhlbi",
    "https://www.nih.gov/news-events/nih-research-matters/brain-aging-tied-cell-cleanup-problems",
    "https://www.nih.gov/news-events/nih-research-matters/covid-19-causes-changes-brain",
]
_NIMH_URLS = [
    "https://www.nimh.nih.gov/health/topics/depression",
    "https://www.nimh.nih.gov/health/topics/anxiety-disorders",
    "https://www.nimh.nih.gov/health/publications/depression",
    "https://www.nimh.nih.gov/health/publications/anxiety-disorders",
]
_HEALTHFINDER_URLS = [
    "https://health.gov/myhealthfinder/topics/health-conditions/diabetes",
    "https://health.gov/myhealthfinder/topics/everyday-healthy-living/nutrition",
    "https://health.gov/myhealthfinder/topics/everyday-healthy-living/physical-activity",
]
_JOHNS_HOPKINS_URLS = [
    "https://www.hopkinsmedicine.org/health/conditions-and-diseases/high-blood-pressure-hypertension",
    "https://www.hopkinsmedicine.org/health/conditions-and-diseases/diabetes",
]
_CLEVELAND_CLINIC_URLS = [
    "https://my.clevelandclinic.org/health/diseases/22276-long-covid",
    "https://my.clevelandclinic.org/health/diseases/9634-diabetes",
]
_HARVARD_HEALTH_URLS = [
    "https://www.health.harvard.edu/mental-health/what-causes-depression",
    "https://www.health.harvard.edu/staying-healthy/foods-linked-to-better-brainpower",
    "https://www.health.harvard.edu/newsletter_article/how-sleep-cleans-your-brain",
]
_HEALTH_GOV_URLS = [
    "https://www.health.gov/healthypeople",
    "https://www.health.gov/news",
]
_NIH_ODS_URLS = [
    "https://ods.od.nih.gov/factsheets/Iron-HealthProfessional/",
    "https://ods.od.nih.gov/factsheets/VitaminD-HealthProfessional/",
]
_RED_CROSS_URLS = [
    "https://www.redcross.org/about-us/news-and-events/news/2023/blood-donation-facts.html",
]

# Flat master list consumed by the knowledge-base builder; ordering matches
# the publisher groups above.
urls = (
    _CDC_URLS
    + _WHO_URLS
    + _MAYO_CLINIC_URLS
    + _MEDLINEPLUS_URLS
    + _NIH_URLS
    + _NIMH_URLS
    + _HEALTHFINDER_URLS
    + _JOHNS_HOPKINS_URLS
    + _CLEVELAND_CLINIC_URLS
    + _HARVARD_HEALTH_URLS
    + _HEALTH_GOV_URLS
    + _NIH_ODS_URLS
    + _RED_CROSS_URLS
)
# 🧽 Scraper function
def scrape_url(url):
    """Fetch *url* and return its paragraph text as a LangChain Document.

    Boilerplate tags (nav, footer, scripts, ...) are stripped and only
    <p> elements longer than 50 characters are kept, joined by blank
    lines. The source URL is recorded in the Document metadata so the UI
    can cite it.

    Returns:
        Document with the extracted text, or None when the request
        fails, the server returns an HTTP error, or no usable paragraph
        text is found — callers filter out the Nones.
    """
    # Several health sites (e.g. Mayo Clinic) reject requests that have
    # no browser-like User-Agent.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; HealthcareRAG/1.0)"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this, 403/404/5xx error pages would be indexed as if
        # they were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Remove chrome/boilerplate so only article text survives.
        for tag in soup(["nav", "footer", "script", "style", "header", "noscript", "iframe"]):
            tag.decompose()
        # Keep substantive paragraphs only; very short ones are usually
        # navigation labels or cookie banners.
        text = "\n\n".join(
            p.get_text(strip=True)
            for p in soup.find_all("p")
            if len(p.get_text(strip=True)) > 50
        )
        if not text:
            # An empty Document would only add noise to the vector index.
            return None
        return Document(page_content=text, metadata={"source": url})
    except Exception as e:
        # Best-effort scraping: report and skip the failed source.
        print(f"Error scraping {url}: {e}")
        return None
# 🧠 Vector store
@st.cache_resource
def prepare_knowledge_base():
    """Scrape every source URL, chunk the text, and build a FAISS index.

    Cached via st.cache_resource, so the (slow) scraping and embedding
    work runs once per server process rather than on every rerun.

    Returns:
        A FAISS vector store over 500-character chunks (50 overlap),
        embedded with sentence-transformers/all-MiniLM-L6-v2.

    Raises:
        RuntimeError: when no source could be scraped at all (e.g. no
            network access) — FAISS.from_documents would otherwise fail
            on the empty corpus with an opaque error.
    """
    docs = [scrape_url(url) for url in urls]
    docs = [doc for doc in docs if doc]  # drop failed scrapes (None entries)
    if not docs:
        raise RuntimeError(
            "No healthcare sources could be scraped; check network access."
        )
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(chunks, embeddings)
# πŸ€– Load LLM
@st.cache_resource
def load_llm():
    """Build the cached text2text-generation pipeline (FLAN-T5 base on CPU).

    Greedy decoding — sampling off, a single beam — keeps answers
    deterministic for a given prompt.
    """
    generation_options = {
        "model": "google/flan-t5-base",
        "device": -1,  # -1 = run on CPU
        "max_new_tokens": 400,
        "do_sample": False,
        "num_beams": 1,
    }
    return pipeline("text2text-generation", **generation_options)
# πŸ–ΌοΈ UI
st.set_page_config(page_title="Healthcare RAG", layout="centered")
st.title("πŸ₯ Healthcare RAG Assistant")
st.markdown("Ask your question. The assistant will summarize info from verified sources like CDC, WHO, Mayo Clinic, and NIMH.")
# Load model and vectorstore
with st.spinner("Loading verified knowledge base..."):
vectorstore = prepare_knowledge_base()
llm = load_llm()
question = st.text_input("πŸ’¬ Ask a healthcare question:")
if question:
with st.spinner("Generating expert response..."):
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
context_docs = retriever.get_relevant_documents(question)
context = "\n\n".join(doc.page_content for doc in context_docs)[:2000] # trim context for faster generation
prompt = f"""
You are a helpful and professional healthcare assistant. You have access to trusted health documents from CDC, Mayo Clinic, WHO, and NIMH.
Using only the context below, summarize the information in a clear, helpful, and medically accurate way. Do not copy or fabricate facts. Structure the response using short paragraphs or bullet points. Act like you're writing a high-quality health blog or patient education guide.
Context:
{context}
Question: {question}
Answer:
"""
response = llm(prompt)
st.success(response[0]['generated_text'])
with st.expander("πŸ“š Sources used"):
for doc in context_docs:
st.markdown(f"- [{doc.metadata['source']}]({doc.metadata['source']})")