Spaces:

suryaprakash01
/

Knowledge_Extraction_from_Pathology_reports

Sleeping

App Files Files Community

Knowledge_Extraction_from_Pathology_reports / app.py

suryaprakash01

Update app.py

6b1d407 verified about 2 months ago

raw

history blame contribute delete

4.83 kB

	#!/usr/bin/env python3
	"""
	Pathology RAG System - Streamlit Version
	Query existing FAISS database
	"""

	import os
	import sys
	from pathlib import Path
	from datetime import datetime

	import streamlit as st

	# Force CPU execution: expose no CUDA devices to anything imported below.
	os.environ.update(CUDA_VISIBLE_DEVICES="")

	# Put the "src" folder (relative to the working directory) on the import path.
	sys.path += ["src"]

	DB_PATH = "output/biomedbert_vector_db"

	# The app only queries an existing database; report and halt if it is absent.
	if not os.path.exists(DB_PATH):
	    st.error("Vector database not found. Upload output/biomedbert_vector_db.")
	    st.stop()

	# Project modules (presumably resolved through the "src" entry added above).
	# An import failure is shown in the page and the script run is stopped.
	try:
	    from retriever import CompleteRAGPipeline
	    from document_processor import DynamicRAGUpdater
	except ImportError as import_err:
	    st.error(f"Import error: {import_err}")
	    st.stop()


	# -----------------------------
	# Load Pipeline (cached)
	# -----------------------------

	@st.cache_resource
	def load_pipeline():
	    """Create the RAG pipeline over the FAISS database at ``DB_PATH``.

	    The function is wrapped in ``st.cache_resource``; callers elsewhere in
	    this script invoke ``load_pipeline.clear()`` to drop the cached instance.

	    Returns:
	        A ``CompleteRAGPipeline`` configured with the BiomedBERT embedding
	        model name.
	    """
	    # Cache busted to pick up the new ask method return dictionary
	    return CompleteRAGPipeline(
	        embedding_model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
	        faiss_db_path=DB_PATH,
	    )


	# -----------------------------
	# Page Config
	# -----------------------------

	# Configure the page before the pipeline is loaded. Streamlit has
	# historically required set_page_config() to be the first Streamlit command
	# of a script run, and the cached loader below may render a spinner while it
	# builds the pipeline, so it must not run ahead of this call.
	st.set_page_config(
	    page_title="Pathology RAG",
	    layout="wide",
	)

	# Shared pipeline instance used by the search section further down.
	pipeline = load_pipeline()

	st.title("🔬 Pathology Report Analysis System")

	st.markdown(
	    """
	AI-powered search and question answering over pathology reports
	Vector database powered by BiomedBERT + FAISS
	"""
	)


	# -----------------------------
	# Session State
	# -----------------------------

	if "query_count" not in st.session_state:
	    # No counter stored yet for this session: start it at zero.
	    st.session_state["query_count"] = 0


	# -----------------------------
	# Sidebar
	# -----------------------------

	st.sidebar.header("System Info")

	# Value of the counter at the time this line runs in the current script run.
	st.sidebar.write(f"Queries: {st.session_state.query_count}")

	# Fixed label/value lines describing the stack, written in order.
	for _sidebar_line in ("Embedding Model:", "BiomedBERT", "Vector DB:", "FAISS"):
	    st.sidebar.write(_sidebar_line)


	# -----------------------------
	# Document Upload
	# -----------------------------

	st.sidebar.divider()
	st.sidebar.header("📄 Upload Report")

	# The form batches the file picker and the submit button; clear_on_submit
	# resets the picker after each submission.
	with st.sidebar.form(key='upload_form', clear_on_submit=True):
	    uploaded_file = st.file_uploader("Upload PDF Pathology Report", type=["pdf"])
	    submit_btn = st.form_submit_button("Process Document")

	if submit_btn and uploaded_file is None:
	    # Give feedback instead of silently ignoring an empty submission.
	    st.sidebar.warning("Please select a PDF file before processing.")

	elif submit_btn:
	    with st.spinner("Processing Document... this may take a while."):

	        # The file name comes from the client. Keep only its final path
	        # component so the upload cannot be written outside upload_dir.
	        safe_name = Path(uploaded_file.name).name
	        upload_dir = Path("uploaded_reports")
	        pdf_path = upload_dir / safe_name

	        # Saving, updater construction and indexing all report failures in
	        # the sidebar rather than aborting the page with a traceback.
	        try:
	            # Save file
	            upload_dir.mkdir(exist_ok=True)
	            pdf_path.write_bytes(uploaded_file.getbuffer())

	            # Instantiate updater
	            updater = DynamicRAGUpdater(
	                vector_db_path=DB_PATH,
	                embedding_model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
	                upload_dir=str(upload_dir),
	            )

	            # Process and add to vector database
	            stats = updater.process_and_add_pdf(str(pdf_path))
	            st.sidebar.success(f"Successfully processed `{safe_name}`")
	            st.sidebar.json(stats)

	            # Clear pipeline cache to reflect new db index
	            load_pipeline.clear()

	        except Exception as e:
	            st.sidebar.error(f"Error during processing: {e}")

	st.sidebar.divider()

	# -----------------------------
	# Query Input
	# -----------------------------

	st.header("🔎 Ask a Question")

	# Free-text question box; the placeholder shows an example query.
	question = st.text_area(
	    label="Enter your medical query",
	    placeholder="What are common findings in breast cancer pathology?",
	)

	# Number of sources to request: slider from 1 to 10, starting at 5.
	num_sources = st.slider("Number of sources", 1, 10, 5)


	# -----------------------------
	# Search Button
	# -----------------------------

	if st.button("Search"):

	    if not question.strip():
	        # Empty or whitespace-only input: nothing to send to the pipeline.
	        st.warning("Please enter a question.")

	    else:
	        # Count the query before running it, then ask the pipeline. A failure
	        # is reported in the page, matching the upload section's handling.
	        try:
	            with st.spinner("Running RAG pipeline..."):
	                st.session_state.query_count += 1
	                result = pipeline.ask(
	                    question,
	                    top_k=num_sources,
	                )
	        except Exception as e:
	            st.error(f"Error during search: {e}")
	        else:
	            # `result` is read as a mapping with "answer", "num_sources"
	            # and "sources" entries.
	            st.subheader("Answer")
	            st.markdown(result["answer"])

	            # Metadata
	            st.subheader("Query Info")
	            st.write({
	                "query_number": st.session_state.query_count,
	                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	                "sources_used": result["num_sources"],
	            })

	            # Sources
	            st.subheader("Sources")
	            sources = result["sources"]

	            if not sources:
	                st.write("No sources retrieved.")

	            for i, source in enumerate(sources, 1):
	                chunk = source["chunk"]

	                # Plain "|" in the label: the former "\|" was an invalid
	                # string escape sequence.
	                with st.expander(f"Source {i} | {chunk['filename']}"):
	                    # Show at most the first 600 characters of the chunk.
	                    st.write(chunk["text"][:600])