# suryaprakash01's picture
# Update app.py
# 6b1d407 verified
#!/usr/bin/env python3
"""
Pathology RAG System - Streamlit Version
Query existing FAISS database
"""
import os
import sys
from pathlib import Path
from datetime import datetime
import streamlit as st
# Hide all CUDA devices from this process (forces CPU execution).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Put the "src" folder (relative to the working directory) on the import path.
sys.path.append("src")

# Location of the prebuilt vector database queried by this app.
DB_PATH = "output/biomedbert_vector_db"

# Nothing can be queried without the database: report the problem and halt.
if not os.path.exists(DB_PATH):
    st.error("Vector database not found. Upload output/biomedbert_vector_db.")
    st.stop()
# RAG pipeline and updater classes (presumably resolved through the "src"
# entry on sys.path). When either import fails, surface the reason in the UI
# and halt the script instead of continuing without them.
try:
    from retriever import CompleteRAGPipeline
    from document_processor import DynamicRAGUpdater
except ImportError as import_error:
    st.error(f"Import error: {import_error}")
    st.stop()
# -----------------------------
# Page Config
# -----------------------------
# Streamlit documents set_page_config() as needing to be the first Streamlit
# command on a page (older releases raise StreamlitAPIException otherwise).
# A cache miss in load_pipeline() draws a spinner, so the page must be
# configured before the pipeline is loaded.
st.set_page_config(
    page_title="Pathology RAG",
    layout="wide",
)


# -----------------------------
# Load Pipeline (cached)
# -----------------------------
@st.cache_resource
def load_pipeline():
    """Build the RAG pipeline backed by the FAISS database at ``DB_PATH``.

    Decorated with ``st.cache_resource`` so the pipeline object is created
    once and shared across reruns; ``load_pipeline.clear()`` discards the
    cached instance so the next call rebuilds it.

    Returns:
        The ``CompleteRAGPipeline`` instance used to answer queries.
    """
    # NOTE: this function was previously edited to bust the cache after the
    # pipeline's ``ask`` method started returning a dictionary.
    return CompleteRAGPipeline(
        faiss_db_path=DB_PATH,
        embedding_model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    )


pipeline = load_pipeline()

st.title("🔬 Pathology Report Analysis System")
st.markdown(
    """
AI-powered search and question answering over pathology reports
Vector database powered by **BiomedBERT + FAISS**
"""
)
# -----------------------------
# Session State
# -----------------------------
# Seed the per-session query counter the first time this session runs the script.
if "query_count" not in st.session_state:
    st.session_state["query_count"] = 0
# -----------------------------
# Sidebar
# -----------------------------
st.sidebar.header("System Info")

# NOTE: this is rendered before the Search handler further down increments
# the counter, so the number shown does not yet include this run's query.
for info_line in (
    f"Queries: {st.session_state.query_count}",
    "Embedding Model:",
    "BiomedBERT",
    "Vector DB:",
    "FAISS",
):
    st.sidebar.write(info_line)
# -----------------------------
# Document Upload
# -----------------------------
st.sidebar.divider()
st.sidebar.header("📄 Upload Report")

# The uploader and its submit button live in one sidebar form; with
# clear_on_submit the widgets are reset after each submission.
with st.sidebar.form(key='upload_form', clear_on_submit=True):
    uploaded_file = st.file_uploader("Upload PDF Pathology Report", type=["pdf"])
    submit_btn = st.form_submit_button("Process Document")

if submit_btn and uploaded_file is None:
    # Submitting an empty form used to do nothing at all; tell the user why.
    st.sidebar.warning("Please choose a PDF file before processing.")
elif submit_btn:
    with st.spinner("Processing Document... this may take a while."):
        # Saving the file and building the updater can fail just like the
        # processing step, so everything is reported through the same handler.
        try:
            # The file name comes from the client: keep only its final path
            # component so it cannot point outside the upload directory.
            safe_name = Path(uploaded_file.name).name

            # Save file
            upload_dir = Path("uploaded_reports")
            upload_dir.mkdir(exist_ok=True)
            pdf_path = upload_dir / safe_name
            with open(pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Instantiate updater
            updater = DynamicRAGUpdater(
                vector_db_path=DB_PATH,
                embedding_model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
                upload_dir=str(upload_dir),
            )

            # Process and add to vector database
            stats = updater.process_and_add_pdf(str(pdf_path))
            st.sidebar.success(f"Successfully processed `{uploaded_file.name}`")
            st.sidebar.json(stats)

            # Drop the cached pipeline so the next load_pipeline() call
            # rebuilds it against the updated database index.
            load_pipeline.clear()
        except Exception as e:
            st.sidebar.error(f"Error during processing: {e}")

st.sidebar.divider()
# -----------------------------
# Query Input
# -----------------------------
st.header("🔎 Ask a Question")

# Free-text question that is sent to the RAG pipeline.
question = st.text_area(
    label="Enter your medical query",
    placeholder="What are common findings in breast cancer pathology?",
)

# Number of sources requested from the pipeline (1-10, default 5).
num_sources = st.slider(label="Number of sources", min_value=1, max_value=10, value=5)
# -----------------------------
# Search Button
# -----------------------------
# Runs the pipeline for the entered question and renders the answer, query
# metadata and retrieved sources. The result is read as a dict with the keys
# "answer", "num_sources" and "sources"; each source carries a "chunk" dict
# with "filename" and "text".
if st.button("Search"):
    if not question.strip():
        st.warning("Please enter a question.")
    else:
        st.session_state.query_count += 1

        # Report pipeline failures in the page, matching how the upload
        # handler reports processing errors, instead of an unhandled traceback.
        try:
            with st.spinner("Running RAG pipeline..."):
                result = pipeline.ask(
                    question,
                    top_k=num_sources,
                )
        except Exception as e:
            st.error(f"Error while running query: {e}")
        else:
            answer = result["answer"]
            st.subheader("Answer")
            st.markdown(answer)

            # Metadata
            st.subheader("Query Info")
            st.write({
                "query_number": st.session_state.query_count,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "sources_used": result["num_sources"],
            })

            # Sources
            st.subheader("Sources")
            sources = result["sources"]
            if not sources:
                st.write("No sources retrieved.")
            for i, source in enumerate(sources, 1):
                chunk = source["chunk"]
                with st.expander(f"Source {i} | {chunk['filename']}"):
                    # Preview only the first 600 characters of the chunk text.
                    st.write(chunk["text"][:600])