Spaces:

cryogenic22
/

doc_knowledge_base

Runtime error

App Files Files Community

doc_knowledge_base / app.py

cryogenic22

Create app.py

69b6b11 verified 10 months ago

raw

history blame contribute delete

26.6 kB

	"""
	Streamlit application for the Pharmaceutical R&D Knowledge Ecosystem.
	"""

	import streamlit as st
	import os
	import pandas as pd
	import json
	import tempfile
	import time
	from datetime import datetime

	from pdf_processor import PDFProcessor
	from knowledge_store import KnowledgeStore
	from llm_interface import LLMInterface
	from graph_builder import (
	init_handlers,
	build_document_extraction_graph,
	build_protocol_coach_graph,
	build_content_authoring_graph,
	build_traceability_graph
	)

	# =========================================================================
	# App Setup and Configuration
	# =========================================================================

	# Page configuration
	st.set_page_config(
	    layout="wide",
	    initial_sidebar_state="expanded",
	    page_title="Pharma R&D Knowledge Ecosystem",
	    page_icon="💊",
	)

	# Seed each per-session value only when its key is missing, so anything
	# already stored in the session is never overwritten.
	for _state_key, _build_default in (
	    ("chat_history", list),
	    ("documents", list),
	    (
	        "knowledge_base_stats",
	        lambda: {
	            "documents": 0,
	            "studies": 0,
	            "endpoints": 0,
	            "objectives": 0,
	            "vectors": 0,
	        },
	    ),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _build_default()
	del _state_key, _build_default

	# Initialize our handlers and graphs
	@st.cache_resource
	def initialize_app():
	    """Create the handlers and workflow graphs used by the rest of the app.

	    The Anthropic API key is taken from the ``ANTHROPIC_API_KEY`` environment
	    variable first and from ``st.secrets`` second. When neither supplies a
	    key, whatever falsy value was found (normally ``None``) is passed on to
	    ``init_handlers``.

	    Returns:
	        dict: The three handlers returned by ``init_handlers`` under
	        ``"pdf_processor"``, ``"knowledge_store"`` and ``"llm_interface"``,
	        plus the built graphs under ``"extraction_graph"``, ``"coach_graph"``,
	        ``"authoring_graph"`` and ``"traceability_graph"``.
	    """
	    api_key = os.environ.get("ANTHROPIC_API_KEY")
	    if not api_key and hasattr(st, "secrets"):
	        try:
	            if "ANTHROPIC_API_KEY" in st.secrets:
	                api_key = st.secrets["ANTHROPIC_API_KEY"]
	        except FileNotFoundError:
	            # Touching st.secrets without a configured secrets file raises
	            # instead of reporting "key absent". Treat it like a missing key
	            # so startup continues exactly as when the file lacks the entry.
	            # NOTE(review): relies on Streamlit's missing-secrets error being
	            # a FileNotFoundError subclass - confirm for the pinned version.
	            pass

	    pdf_processor, knowledge_store, llm_interface = init_handlers(api_key)

	    # One graph per feature of the app.
	    return {
	        "pdf_processor": pdf_processor,
	        "knowledge_store": knowledge_store,
	        "llm_interface": llm_interface,
	        "extraction_graph": build_document_extraction_graph(),
	        "coach_graph": build_protocol_coach_graph(),
	        "authoring_graph": build_content_authoring_graph(),
	        "traceability_graph": build_traceability_graph(),
	    }

	# Initialize app resources
	# Expose every entry of the resource dict as its own module-level name so
	# the helpers and tab renderers below can refer to them directly.
	app_resources = initialize_app()
	(
	    pdf_processor,
	    knowledge_store,
	    llm_interface,
	    extraction_graph,
	    coach_graph,
	    authoring_graph,
	    traceability_graph,
	) = (
	    app_resources[resource_name]
	    for resource_name in (
	        "pdf_processor",
	        "knowledge_store",
	        "llm_interface",
	        "extraction_graph",
	        "coach_graph",
	        "authoring_graph",
	        "traceability_graph",
	    )
	)

	# =========================================================================
	# Helper Functions
	# =========================================================================

	def update_knowledge_base_stats():
	    """Recount the knowledge-base entities and store the totals in session state.

	    Any exception raised while querying ``knowledge_store`` is shown with
	    ``st.error``; in that case the previously stored statistics are kept.
	    """
	    try:
	        all_docs = knowledge_store.get_all_documents()
	        doc_total = len(all_docs)

	        # Distinct, non-empty protocol IDs are what the UI reports as "studies".
	        study_ids = {
	            entry["protocol_id"]
	            for entry in all_docs
	            if "protocol_id" in entry and entry["protocol_id"]
	        }

	        vector_total = knowledge_store.get_vector_store_stats().get("document_count", 0)

	        # Objectives and endpoints are stored per protocol, so add them up.
	        objective_total = endpoint_total = 0
	        for study_id in study_ids:
	            study_objectives = knowledge_store.get_objectives_by_protocol_id(study_id)
	            study_endpoints = knowledge_store.get_endpoints_by_protocol_id(study_id)
	            objective_total += len(study_objectives)
	            endpoint_total += len(study_endpoints)

	        st.session_state.knowledge_base_stats = dict(
	            documents=doc_total,
	            studies=len(study_ids),
	            objectives=objective_total,
	            endpoints=endpoint_total,
	            vectors=vector_total,
	        )
	    except Exception as e:
	        st.error(f"Error updating knowledge base stats: {e}")

	def process_document(uploaded_file):
	    """Run one uploaded PDF through the document extraction workflow.

	    The upload is copied to a temporary ``.pdf`` file whose path is handed to
	    ``extraction_graph`` as ``document_path``. The temporary file is removed
	    again before returning, whatever the outcome. Progress is reported through
	    a Streamlit progress bar and a status line.

	    Args:
	        uploaded_file: Upload object providing ``getbuffer()`` and ``name``.

	    Returns:
	        dict: Always has ``"status"`` (``"success"`` or ``"error"``) and
	        ``"filename"``. Successful results add ``"document_id"`` and
	        ``"protocol_id"``; failed results add ``"error"``.
	    """
	    # Bound up front so the cleanup in ``finally`` can tell whether a
	    # temporary file was ever created.
	    file_path = None
	    try:
	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        # Step 1: persist the upload so the workflow can open it by path.
	        status_text.text("Saving uploaded file...")
	        progress_bar.progress(10)

	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	            # Record the path before writing: with delete=False the file
	            # would otherwise be left behind if the write itself failed.
	            file_path = tmp_file.name
	            tmp_file.write(uploaded_file.getbuffer())

	        # Step 2: run the extraction workflow on the saved file.
	        status_text.text("Parsing document...")
	        progress_bar.progress(20)

	        result_state = extraction_graph.invoke({
	            "document_path": file_path,
	            "status": "initialized",
	        })

	        if result_state.get("status") == "error":
	            error_message = result_state.get("error", "Unknown error")
	            status_text.text(f"Error: {error_message}")
	            progress_bar.progress(100)
	            return {
	                "status": "error",
	                "error": error_message,
	                "filename": uploaded_file.name,
	            }

	        status_text.text("Processing completed successfully!")
	        progress_bar.progress(100)

	        update_knowledge_base_stats()

	        # "extracted_study" may be absent or explicitly None; neither case
	        # should turn a successful extraction into an error.
	        extracted_study = result_state.get("extracted_study") or {}
	        return {
	            "status": "success",
	            "filename": uploaded_file.name,
	            "document_id": result_state.get("document_id"),
	            "protocol_id": extracted_study.get("protocol_id"),
	        }
	    except Exception as e:
	        st.error(f"Error processing document: {e}")
	        return {
	            "status": "error",
	            "error": str(e),
	            "filename": uploaded_file.name,
	        }
	    finally:
	        if file_path is not None:
	            try:
	                os.unlink(file_path)
	            except OSError:
	                # Best-effort cleanup: a temp file that cannot be removed
	                # must not mask the result being returned.
	                pass

	def chat_with_protocol_coach(query):
	    """Send a user question through the Protocol Coach graph.

	    Args:
	        query: The question text entered by the user.

	    Returns:
	        dict: ``"status"`` is ``"success"`` with ``"response"`` and
	        ``"context"`` taken from the graph's result state, or ``"error"``
	        with the exception message under ``"error"``.
	    """
	    try:
	        # The graph receives the question together with the session's history.
	        coach_input = dict(query=query, chat_history=st.session_state.chat_history)
	        coach_output = coach_graph.invoke(coach_input)

	        answer = coach_output.get("response", "I couldn't generate a response.")
	        sources = coach_output.get("retrieved_context", [])
	    except Exception as exc:
	        return {"status": "error", "error": str(exc)}

	    return {"status": "success", "response": answer, "context": sources}

	def generate_document_section(section_type, protocol_id=None, style_guide=None):
	    """Produce a document section through the content authoring graph.

	    Args:
	        section_type: Name of the section to write.
	        protocol_id: Optional protocol ID, forwarded as ``target_protocol_id``.
	        style_guide: Optional free-text style instructions.

	    Returns:
	        dict: ``"status"`` is ``"success"`` with ``"content"`` and
	        ``"context"`` taken from the graph's result state, or ``"error"``
	        with the exception message under ``"error"``.
	    """
	    try:
	        authoring_input = dict(
	            section_type=section_type,
	            target_protocol_id=protocol_id,
	            style_guide=style_guide,
	        )
	        authoring_output = authoring_graph.invoke(authoring_input)

	        generated = authoring_output.get("generated_content", "I couldn't generate the content.")
	        sources = authoring_output.get("retrieved_context", [])
	    except Exception as exc:
	        return {"status": "error", "error": str(exc)}

	    return {"status": "success", "content": generated, "context": sources}

	def analyze_document_traceability(source_id, target_id, entity_type):
	    """Compare one entity type across two documents via the traceability graph.

	    Args:
	        source_id: Document ID forwarded as ``source_document_id``.
	        target_id: Document ID forwarded as ``target_document_id``.
	        entity_type: Identifier of the entity kind to compare.

	    Returns:
	        dict: ``"status"`` is ``"success"`` with ``"analysis"`` and
	        ``"matched_pairs"`` taken from the graph's result state, or
	        ``"error"`` with the exception message under ``"error"``.
	    """
	    try:
	        trace_input = dict(
	            source_document_id=source_id,
	            target_document_id=target_id,
	            entity_type=entity_type,
	        )
	        trace_output = traceability_graph.invoke(trace_input)

	        analysis_text = trace_output.get("analysis", "I couldn't perform the analysis.")
	        pairs = trace_output.get("matched_pairs", [])
	    except Exception as exc:
	        return {"status": "error", "error": str(exc)}

	    return {"status": "success", "analysis": analysis_text, "matched_pairs": pairs}

	# =========================================================================
	# Sidebar: Document Upload and Management
	# =========================================================================

	def render_sidebar():
	    """Draw the document-management sidebar.

	    Shows the knowledge-base metrics held in session state, a multi-file PDF
	    uploader with a "Process Documents" button, the documents processed in
	    this session, and a "Refresh Stats" button.
	    """
	    sidebar = st.sidebar
	    sidebar.title("Document Management")

	    # --- Metrics ---------------------------------------------------------
	    sidebar.subheader("Knowledge Base Stats")
	    kb_stats = st.session_state.knowledge_base_stats
	    left, right = sidebar.columns(2)
	    left.metric("Documents", kb_stats["documents"])
	    right.metric("Studies", kb_stats["studies"])
	    left.metric("Objectives", kb_stats["objectives"])
	    right.metric("Endpoints", kb_stats["endpoints"])
	    sidebar.metric("Vector Chunks", kb_stats["vectors"])

	    # --- Upload ----------------------------------------------------------
	    sidebar.subheader("Upload Documents")
	    pending_uploads = sidebar.file_uploader(
	        "Upload Protocol/SAP PDFs",
	        type="pdf",
	        accept_multiple_files=True,
	        help="Upload clinical documents (Protocol, SAP, etc.) to add to the knowledge base.",
	    )

	    # The button is only drawn once at least one file has been chosen.
	    if pending_uploads and sidebar.button("Process Documents"):
	        with sidebar.expander("Processing Results", expanded=True):
	            for upload in pending_uploads:
	                st.write(f"Processing: {upload.name}")
	                outcome = process_document(upload)

	                if outcome["status"] != "success":
	                    st.error(f"Error processing {outcome['filename']}: {outcome.get('error', 'Unknown error')}")
	                    continue

	                st.success(f"Successfully processed {outcome['filename']}")

	                # A filename is listed at most once per session.
	                already_listed = any(
	                    known.get("filename") == outcome["filename"]
	                    for known in st.session_state.documents
	                )
	                if not already_listed:
	                    st.session_state.documents.append({
	                        "filename": outcome["filename"],
	                        "document_id": outcome.get("document_id"),
	                        "protocol_id": outcome.get("protocol_id"),
	                        "processed_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
	                    })

	    # --- Processed documents ---------------------------------------------
	    sidebar.subheader("Processed Documents")
	    if st.session_state.documents:
	        for entry in st.session_state.documents:
	            with sidebar.expander(f"{entry['filename']}"):
	                st.write(f"Protocol ID: {entry.get('protocol_id', 'Unknown')}")
	                st.write(f"Processed: {entry.get('processed_date', 'Unknown')}")
	    else:
	        sidebar.info("No documents processed yet.")

	    # --- Manual refresh --------------------------------------------------
	    if sidebar.button("Refresh Stats"):
	        update_knowledge_base_stats()
	        sidebar.success("Stats refreshed!")

	# =========================================================================
	# Main Content Tabs
	# =========================================================================

	def render_protocol_coach_tab():
	    """Draw the Protocol Coach chat: past messages, input box, new answer.

	    A submitted question and its answer (or error text) are both appended to
	    ``st.session_state.chat_history``.
	    """
	    st.header("Protocol Coach Chatbot")
	    st.info("Ask questions about the protocol documents in the knowledge base. The Protocol Coach will retrieve relevant information to answer your questions.")

	    # Replay the conversation so far.
	    for past_message in st.session_state.chat_history:
	        with st.chat_message(past_message["role"]):
	            st.markdown(past_message["content"])

	    query = st.chat_input("Ask about protocols...")
	    if not query:
	        return

	    # Record and echo the new question before answering it.
	    st.session_state.chat_history.append({"role": "user", "content": query})
	    with st.chat_message("user"):
	        st.markdown(query)

	    with st.chat_message("assistant"):
	        with st.spinner("Thinking..."):
	            result = chat_with_protocol_coach(query)

	            if result["status"] != "success":
	                failure_text = f"Error: {result.get('error', 'Unknown error')}"
	                st.error(failure_text)
	                st.session_state.chat_history.append({"role": "assistant", "content": failure_text})
	                return

	            st.markdown(result["response"])

	            # Retrieved sources are listed only while the debug flag is on.
	            if st.session_state.get("debug_mode", False):
	                with st.expander("Context Sources"):
	                    for source_number, ctx in enumerate(result.get("context", []), start=1):
	                        st.write(f"Source {source_number}: {ctx.get('metadata', {}).get('source', 'Unknown')}")
	                        st.write(f"Section: {ctx.get('metadata', {}).get('section', 'Unknown')}")
	                        st.write("---")

	            st.session_state.chat_history.append({"role": "assistant", "content": result["response"]})

	def render_content_authoring_tab():
	    """Draw the Content Authoring tab.

	    The left column collects the section type, an optional target protocol,
	    an optional style guide and the "Show Context Sources" toggle (stored as
	    ``st.session_state.debug_mode``). The right column shows the content
	    generated when "Generate Content" is pressed.
	    """
	    st.header("Content Authoring Assistant")
	    st.info("Generate document sections based on knowledge extracted from similar documents.")

	    col1, col2 = st.columns([1, 1])

	    with col1:
	        st.subheader("Content Generation Settings")

	        section_types = [
	            "Introduction",
	            "Objectives and Endpoints",
	            "Study Design",
	            "Study Population",
	            "Statistical Considerations",
	            "Inclusion Criteria",
	            "Exclusion Criteria",
	            "Safety Assessments",
	            "Pharmacokinetic Assessments",
	        ]
	        section_type = st.selectbox("Select Section Type", section_types)

	        # Several processed documents can share one protocol ID; list each
	        # ID once, as the Knowledge Explorer filter does.
	        protocol_options = ["--None--"]
	        for doc in st.session_state.documents:
	            protocol_id = doc.get("protocol_id")
	            if protocol_id and protocol_id not in protocol_options:
	                protocol_options.append(protocol_id)

	        target_protocol = st.selectbox(
	            "Target Protocol ID (Optional)",
	            protocol_options,
	        )
	        # The placeholder entry means "no target protocol".
	        target_protocol = None if target_protocol == "--None--" else target_protocol

	        style_guide = st.text_area(
	            "Style Guide (Optional)",
	            placeholder="Enter any specific style guidelines or content requirements...",
	        )

	        generate_button = st.button("Generate Content")

	        # This checkbox also controls the source listings in the other tabs,
	        # which read the same session flag.
	        st.session_state.debug_mode = st.checkbox("Show Context Sources", value=st.session_state.get("debug_mode", False))

	    with col2:
	        st.subheader("Generated Content")

	        if generate_button:
	            with st.spinner("Generating content..."):
	                result = generate_document_section(
	                    section_type=section_type,
	                    protocol_id=target_protocol,
	                    # An empty text area is forwarded as "no style guide".
	                    style_guide=style_guide if style_guide else None,
	                )

	            if result["status"] == "success":
	                st.markdown(result["content"])

	                if st.session_state.get("debug_mode", False):
	                    with st.expander("Context Sources"):
	                        for i, ctx in enumerate(result.get("context", [])):
	                            st.write(f"Source {i+1}: {ctx.get('metadata', {}).get('source', 'Unknown')}")
	                            st.write(f"Section: {ctx.get('metadata', {}).get('section', 'Unknown')}")
	                            st.write("---")
	            else:
	                st.error(f"Error: {result.get('error', 'Unknown error')}")

	def render_traceability_tab():
	    """Draw the Cross-Document Traceability tab.

	    The left column picks a source document, a target document and the
	    entity type to compare; the right column shows the analysis produced
	    when "Analyze Traceability" is pressed.
	    """
	    st.header("Cross-Document Traceability")
	    st.info("Analyze relationships between related documents (e.g., Protocol and SAP).")

	    settings_col, results_col = st.columns([1, 1])

	    def option_label(option):
	        # Every select box below holds {"id", "label"} dicts; show the label.
	        return option["label"]

	    with settings_col:
	        st.subheader("Traceability Analysis Settings")

	        processed_docs = [
	            {
	                "id": entry.get("document_id", ""),
	                "label": f"{entry['filename']} ({entry.get('protocol_id', 'Unknown')})",
	            }
	            for entry in st.session_state.documents
	        ]

	        # Each document list starts with a placeholder whose empty id
	        # marks "nothing selected".
	        source_doc = st.selectbox(
	            "Source Document",
	            options=[{"id": "", "label": "--Select Source Document--"}, *processed_docs],
	            format_func=option_label,
	        )

	        target_doc = st.selectbox(
	            "Target Document",
	            options=[{"id": "", "label": "--Select Target Document--"}, *processed_docs],
	            format_func=option_label,
	        )

	        entity_type = st.selectbox(
	            "Entity Type to Compare",
	            options=[
	                {"id": "objectives", "label": "Study Objectives"},
	                {"id": "endpoints", "label": "Endpoints"},
	                {"id": "population", "label": "Population Criteria"},
	            ],
	            format_func=option_label,
	        )

	        analyze_button = st.button("Analyze Traceability")

	    with results_col:
	        st.subheader("Analysis Results")

	        if not analyze_button:
	            return

	        if not (source_doc["id"] and target_doc["id"]):
	            st.error("Please select both source and target documents.")
	            return

	        with st.spinner("Analyzing traceability..."):
	            result = analyze_document_traceability(
	                source_id=source_doc["id"],
	                target_id=target_doc["id"],
	                entity_type=entity_type["id"],
	            )

	        if result["status"] != "success":
	            st.error(f"Error: {result.get('error', 'Unknown error')}")
	            return

	        st.markdown(result["analysis"])

	        # Matched pairs are listed only while the debug flag is on.
	        if st.session_state.get("debug_mode", False) and result.get("matched_pairs"):
	            with st.expander("Matched Entity Pairs"):
	                for pair_number, pair in enumerate(result["matched_pairs"], start=1):
	                    st.write(f"Pair {pair_number}")
	                    st.write(f"Source: {pair.get('source_text', 'Unknown')}")
	                    st.write(f"Target: {pair.get('target_text', 'Unknown')}")
	                    st.write("---")

	def _collect_across_protocols(fetch_by_protocol_id):
	    """Concatenate ``fetch_by_protocol_id(pid)`` for every stored protocol ID."""
	    protocol_ids = {
	        doc["protocol_id"]
	        for doc in knowledge_store.get_all_documents()
	        if "protocol_id" in doc and doc["protocol_id"]
	    }
	    collected = []
	    for pid in protocol_ids:
	        collected.extend(fetch_by_protocol_id(pid))
	    return collected


	def render_knowledge_explorer_tab():
	    """Draw the Knowledge Base Explorer tab.

	    Lets the user pick an entity type, optionally restrict it to one protocol
	    ID and/or a case-insensitive text search, then shows the matching records
	    as a table (falling back to raw JSON if a DataFrame cannot be built).
	    Errors raised while querying ``knowledge_store`` are shown with
	    ``st.error``.
	    """
	    st.header("Knowledge Base Explorer")
	    st.info("Explore the structured data extracted from documents in the knowledge base.")

	    entity_types = [
	        {"id": "studies", "label": "Studies"},
	        {"id": "objectives", "label": "Study Objectives"},
	        {"id": "endpoints", "label": "Endpoints"},
	        {"id": "population", "label": "Population Criteria"},
	        {"id": "documents", "label": "Documents"},
	    ]
	    entity_type = st.selectbox(
	        "Select Entity Type",
	        options=entity_types,
	        format_func=lambda x: x["label"],
	    )

	    # Offer each protocol ID seen in this session once.
	    protocol_options = ["--All Protocols--"]
	    for doc in st.session_state.documents:
	        if doc.get("protocol_id") and doc.get("protocol_id") not in protocol_options:
	            protocol_options.append(doc.get("protocol_id"))

	    filter_protocol = st.selectbox(
	        "Filter by Protocol ID",
	        protocol_options,
	    )
	    # The placeholder entry means "no protocol filter".
	    filter_protocol = None if filter_protocol == "--All Protocols--" else filter_protocol

	    search_query = st.text_input(
	        "Search Query (Optional)",
	        placeholder="Enter text to search for...",
	    )

	    st.subheader("Results")

	    try:
	        entity_id = entity_type["id"]
	        if entity_id == "studies":
	            if filter_protocol:
	                study = knowledge_store.get_study_by_protocol_id(filter_protocol)
	                # NOTE(review): assumes the store returns None for an unknown
	                # protocol ID; without this check the table would show one
	                # empty row and report "1 items found".
	                data = [study] if study is not None else []
	            else:
	                data = knowledge_store.get_all_studies()
	        elif entity_id == "objectives":
	            if filter_protocol:
	                data = knowledge_store.get_objectives_by_protocol_id(filter_protocol)
	            else:
	                data = _collect_across_protocols(knowledge_store.get_objectives_by_protocol_id)
	        elif entity_id == "endpoints":
	            if filter_protocol:
	                data = knowledge_store.get_endpoints_by_protocol_id(filter_protocol)
	            else:
	                data = _collect_across_protocols(knowledge_store.get_endpoints_by_protocol_id)
	        elif entity_id == "population":
	            if filter_protocol:
	                data = knowledge_store.get_population_criteria_by_protocol_id(filter_protocol)
	            else:
	                data = _collect_across_protocols(knowledge_store.get_population_criteria_by_protocol_id)
	        elif entity_id == "documents":
	            if filter_protocol:
	                data = knowledge_store.get_documents_by_protocol_id(filter_protocol)
	            else:
	                data = knowledge_store.get_all_documents()
	        else:
	            data = []

	        if search_query:
	            needle = search_query.lower()
	            # Each record is searched through its JSON text.
	            # ensure_ascii=False keeps non-ASCII characters literal so they
	            # can match the query; default=str lets values JSON cannot encode
	            # be searched by their string form instead of aborting the view.
	            data = [
	                item
	                for item in data
	                if needle in json.dumps(item, ensure_ascii=False, default=str).lower()
	            ]

	        if not data:
	            st.info("No data found.")
	        else:
	            st.write(f"{len(data)} items found")

	            # Prefer a table; fall back to raw JSON for irregular records.
	            try:
	                df = pd.DataFrame(data)
	                st.dataframe(df, use_container_width=True)
	            except Exception:
	                st.json(data)
	    except Exception as e:
	        st.error(f"Error retrieving data: {e}")

	# =========================================================================
	# Main App
	# =========================================================================

	def main():
	    """Render the whole page: title, sidebar, the four feature tabs, footer."""
	    st.title("🧠 Pharmaceutical R&D Knowledge Ecosystem")

	    # Load the counts before the sidebar is drawn. The sidebar metrics read
	    # ``knowledge_base_stats`` while rendering, so refreshing afterwards
	    # would leave zeros on screen for the first run. A stored count of zero
	    # documents is treated as "not loaded yet".
	    if st.session_state.knowledge_base_stats["documents"] == 0:
	        update_knowledge_base_stats()

	    render_sidebar()

	    tab1, tab2, tab3, tab4 = st.tabs([
	        "📝 Content Authoring",
	        "🤖 Protocol Coach",
	        "🔍 Knowledge Explorer",
	        "🔄 Cross-Document Traceability",
	    ])

	    with tab1:
	        render_content_authoring_tab()

	    with tab2:
	        render_protocol_coach_tab()

	    with tab3:
	        render_knowledge_explorer_tab()

	    with tab4:
	        render_traceability_tab()

	    # Footer. The backslash is written as an explicit escape so the literal
	    # no longer contains an invalid escape sequence; the emitted text is
	    # unchanged.
	    st.markdown("---")
	    st.caption("Pharmaceutical R&D Knowledge Ecosystem \\| A demonstration of AI-assisted document processing and knowledge management")


	if __name__ == "__main__":
	    main()