Spaces:

cryogenic22
/

doc_knowledge_base

Runtime error

App Files Files Community

cryogenic22 committed on Apr 19, 2025

Commit

69b6b11

verified ·

1 Parent(s): 3e1db99

Create app.py

Browse files

Files changed (1) hide show

app.py +694 -0

app.py ADDED Viewed

	@@ -0,0 +1,694 @@

+"""
+Streamlit application for the Pharmaceutical R&D Knowledge Ecosystem.
+"""
+import streamlit as st
+import os
+import pandas as pd
+import json
+import tempfile
+import time
+from datetime import datetime
+from pdf_processor import PDFProcessor
+from knowledge_store import KnowledgeStore
+from llm_interface import LLMInterface
+from graph_builder import (
+    init_handlers,
+    build_document_extraction_graph,
+    build_protocol_coach_graph,
+    build_content_authoring_graph,
+    build_traceability_graph
+)
+# =========================================================================
+# App Setup and Configuration
+# =========================================================================
+# Page configuration
+st.set_page_config(
+    page_title="Pharma R&D Knowledge Ecosystem",
+    page_icon="💊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Initialize session state variables if they don't exist
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "documents" not in st.session_state:
+    st.session_state.documents = []
+if "knowledge_base_stats" not in st.session_state:
+    st.session_state.knowledge_base_stats = {
+        "documents": 0,
+        "studies": 0,
+        "endpoints": 0,
+        "objectives": 0,
+        "vectors": 0
+    }
+# Initialize our handlers and graphs
+@st.cache_resource
+def initialize_app():
+    """Initialize app resources and LangGraph workflows."""
+    # Get API key from environment or secrets
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key and hasattr(st, "secrets") and "ANTHROPIC_API_KEY" in st.secrets:
+        api_key = st.secrets["ANTHROPIC_API_KEY"]
+    # Initialize handlers
+    pdf_processor, knowledge_store, llm_interface = init_handlers(api_key)
+    # Build LangGraph workflows
+    extraction_graph = build_document_extraction_graph()
+    coach_graph = build_protocol_coach_graph()
+    authoring_graph = build_content_authoring_graph()
+    traceability_graph = build_traceability_graph()
+    return {
+        "pdf_processor": pdf_processor,
+        "knowledge_store": knowledge_store,
+        "llm_interface": llm_interface,
+        "extraction_graph": extraction_graph,
+        "coach_graph": coach_graph,
+        "authoring_graph": authoring_graph,
+        "traceability_graph": traceability_graph
+    }
+# Initialize app resources
+app_resources = initialize_app()
+pdf_processor = app_resources["pdf_processor"]
+knowledge_store = app_resources["knowledge_store"]
+llm_interface = app_resources["llm_interface"]
+extraction_graph = app_resources["extraction_graph"]
+coach_graph = app_resources["coach_graph"]
+authoring_graph = app_resources["authoring_graph"]
+traceability_graph = app_resources["traceability_graph"]
+# =========================================================================
+# Helper Functions
+# =========================================================================
+def update_knowledge_base_stats():
+    """Update the knowledge base statistics in session state."""
+    try:
+        # Get counts of different entity types
+        documents = knowledge_store.get_all_documents()
+        document_count = len(documents)
+        # Get unique protocol IDs
+        protocol_ids = set()
+        for doc in documents:
+            if "protocol_id" in doc and doc["protocol_id"]:
+                protocol_ids.add(doc["protocol_id"])
+        # Get vector store stats
+        vector_stats = knowledge_store.get_vector_store_stats()
+        vector_count = vector_stats.get("document_count", 0)
+        # Count objectives and endpoints across all protocols
+        objective_count = 0
+        endpoint_count = 0
+        for protocol_id in protocol_ids:
+            objectives = knowledge_store.get_objectives_by_protocol_id(protocol_id)
+            endpoints = knowledge_store.get_endpoints_by_protocol_id(protocol_id)
+            objective_count += len(objectives)
+            endpoint_count += len(endpoints)
+        # Update session state
+        st.session_state.knowledge_base_stats = {
+            "documents": document_count,
+            "studies": len(protocol_ids),
+            "objectives": objective_count,
+            "endpoints": endpoint_count,
+            "vectors": vector_count
+        }
+    except Exception as e:
+        st.error(f"Error updating knowledge base stats: {e}")
+def process_document(uploaded_file):
+    """Process an uploaded document and store in knowledge base."""
+    try:
+        # Create a progress bar
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        # Step 1: Save the uploaded file
+        status_text.text("Saving uploaded file...")
+        progress_bar.progress(10)
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(uploaded_file.getbuffer())
+            file_path = tmp_file.name
+        # Step 2: Process through LangGraph extraction workflow
+        status_text.text("Parsing document...")
+        progress_bar.progress(20)
+        # Initialize state for extraction
+        initial_state = {
+            "document_path": file_path,
+            "status": "initialized"
+        }
+        # Run extraction workflow
+        result_state = extraction_graph.invoke(initial_state)
+        # Update progress based on status
+        if result_state.get("status") == "error":
+            status_text.text(f"Error: {result_state.get('error', 'Unknown error')}")
+            progress_bar.progress(100)
+            return {
+                "status": "error",
+                "error": result_state.get("error", "Unknown error"),
+                "filename": uploaded_file.name
+            }
+        # Update progress
+        status_text.text("Processing completed successfully!")
+        progress_bar.progress(100)
+        # Update knowledge base stats
+        update_knowledge_base_stats()
+        # Return result
+        return {
+            "status": "success",
+            "filename": uploaded_file.name,
+            "document_id": result_state.get("document_id"),
+            "protocol_id": result_state.get("extracted_study", {}).get("protocol_id")
+        }
+    except Exception as e:
+        st.error(f"Error processing document: {e}")
+        return {
+            "status": "error",
+            "error": str(e),
+            "filename": uploaded_file.name
+        }
+    finally:
+        # Clean up temporary file
+        if 'file_path' in locals():
+            try:
+                os.unlink(file_path)
+            except:
+                pass
+def chat_with_protocol_coach(query):
+    """Process a query through the Protocol Coach."""
+    try:
+        # Initialize state for Protocol Coach
+        initial_state = {
+            "query": query,
+            "chat_history": st.session_state.chat_history
+        }
+        # Run Protocol Coach workflow
+        result_state = coach_graph.invoke(initial_state)
+        return {
+            "status": "success",
+            "response": result_state.get("response", "I couldn't generate a response."),
+            "context": result_state.get("retrieved_context", [])
+        }
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+def generate_document_section(section_type, protocol_id=None, style_guide=None):
+    """Generate a document section using the content authoring workflow."""
+    try:
+        # Initialize state for Content Authoring
+        initial_state = {
+            "section_type": section_type,
+            "target_protocol_id": protocol_id,
+            "style_guide": style_guide
+        }
+        # Run Content Authoring workflow
+        result_state = authoring_graph.invoke(initial_state)
+        return {
+            "status": "success",
+            "content": result_state.get("generated_content", "I couldn't generate the content."),
+            "context": result_state.get("retrieved_context", [])
+        }
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+def analyze_document_traceability(source_id, target_id, entity_type):
+    """Analyze traceability between two documents."""
+    try:
+        # Initialize state for Traceability Analysis
+        initial_state = {
+            "source_document_id": source_id,
+            "target_document_id": target_id,
+            "entity_type": entity_type
+        }
+        # Run Traceability Analysis workflow
+        result_state = traceability_graph.invoke(initial_state)
+        return {
+            "status": "success",
+            "analysis": result_state.get("analysis", "I couldn't perform the analysis."),
+            "matched_pairs": result_state.get("matched_pairs", [])
+        }
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+# =========================================================================
+# Sidebar: Document Upload and Management
+# =========================================================================
+def render_sidebar():
+    """Render the sidebar for document management."""
+    st.sidebar.title("Document Management")
+    # Knowledge Base Stats
+    st.sidebar.subheader("Knowledge Base Stats")
+    stats = st.session_state.knowledge_base_stats
+    col1, col2 = st.sidebar.columns(2)
+    col1.metric("Documents", stats["documents"])
+    col2.metric("Studies", stats["studies"])
+    col1.metric("Objectives", stats["objectives"])
+    col2.metric("Endpoints", stats["endpoints"])
+    st.sidebar.metric("Vector Chunks", stats["vectors"])
+    # Document Upload
+    st.sidebar.subheader("Upload Documents")
+    uploaded_files = st.sidebar.file_uploader(
+        "Upload Protocol/SAP PDFs",
+        type="pdf",
+        accept_multiple_files=True,
+        help="Upload clinical documents (Protocol, SAP, etc.) to add to the knowledge base."
+    )
+    # Process uploaded files if any
+    if uploaded_files:
+        if st.sidebar.button("Process Documents"):
+            with st.sidebar.expander("Processing Results", expanded=True):
+                for uploaded_file in uploaded_files:
+                    st.write(f"Processing: {uploaded_file.name}")
+                    result = process_document(uploaded_file)
+                    if result["status"] == "success":
+                        st.success(f"Successfully processed {result['filename']}")
+                        # Add to documents list if not already there
+                        doc_exists = False
+                        for doc in st.session_state.documents:
+                            if doc.get("filename") == result["filename"]:
+                                doc_exists = True
+                                break
+                        if not doc_exists:
+                            st.session_state.documents.append({
+                                "filename": result["filename"],
+                                "document_id": result.get("document_id"),
+                                "protocol_id": result.get("protocol_id"),
+                                "processed_date": datetime.now().strftime("%Y-%m-%d %H:%M")
+                            })
+                    else:
+                        st.error(f"Error processing {result['filename']}: {result.get('error', 'Unknown error')}")
+    # Document list
+    st.sidebar.subheader("Processed Documents")
+    if not st.session_state.documents:
+        st.sidebar.info("No documents processed yet.")
+    else:
+        for i, doc in enumerate(st.session_state.documents):
+            with st.sidebar.expander(f"{doc['filename']}"):
+                st.write(f"**Protocol ID:** {doc.get('protocol_id', 'Unknown')}")
+                st.write(f"**Processed:** {doc.get('processed_date', 'Unknown')}")
+    # Refresh Stats Button
+    if st.sidebar.button("Refresh Stats"):
+        update_knowledge_base_stats()
+        st.sidebar.success("Stats refreshed!")
+# =========================================================================
+# Main Content Tabs
+# =========================================================================
+def render_protocol_coach_tab():
+    """Render the Protocol Coach chatbot tab."""
+    st.header("Protocol Coach Chatbot")
+    st.info("Ask questions about the protocol documents in the knowledge base. The Protocol Coach will retrieve relevant information to answer your questions.")
+    # Initialize or display chat history
+    for message in st.session_state.chat_history:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Chat input
+    if query := st.chat_input("Ask about protocols..."):
+        # Add user message to chat history and display
+        st.session_state.chat_history.append({"role": "user", "content": query})
+        with st.chat_message("user"):
+            st.markdown(query)
+        # Process query
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                result = chat_with_protocol_coach(query)
+                if result["status"] == "success":
+                    st.markdown(result["response"])
+                    # Show context sources if debug mode enabled
+                    if st.session_state.get("debug_mode", False):
+                        with st.expander("Context Sources"):
+                            for i, ctx in enumerate(result.get("context", [])):
+                                st.write(f"**Source {i+1}:** {ctx.get('metadata', {}).get('source', 'Unknown')}")
+                                st.write(f"**Section:** {ctx.get('metadata', {}).get('section', 'Unknown')}")
+                                st.write("---")
+                    # Add assistant response to chat history
+                    st.session_state.chat_history.append({"role": "assistant", "content": result["response"]})
+                else:
+                    st.error(f"Error: {result.get('error', 'Unknown error')}")
+                    st.session_state.chat_history.append({"role": "assistant", "content": f"Error: {result.get('error', 'Unknown error')}"})
+def render_content_authoring_tab():
+    """Render the Content Authoring tab."""
+    st.header("Content Authoring Assistant")
+    st.info("Generate document sections based on knowledge extracted from similar documents.")
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.subheader("Content Generation Settings")
+        # Section Type Selection
+        section_types = [
+            "Introduction",
+            "Objectives and Endpoints",
+            "Study Design",
+            "Study Population",
+            "Statistical Considerations",
+            "Inclusion Criteria",
+            "Exclusion Criteria",
+            "Safety Assessments",
+            "Pharmacokinetic Assessments"
+        ]
+        section_type = st.selectbox("Select Section Type", section_types)
+        # Protocol Selection for Context (Optional)
+        protocol_options = ["--None--"]
+        for doc in st.session_state.documents:
+            if doc.get("protocol_id"):
+                protocol_options.append(doc.get("protocol_id"))
+        target_protocol = st.selectbox(
+            "Target Protocol ID (Optional)",
+            protocol_options
+        )
+        target_protocol = None if target_protocol == "--None--" else target_protocol
+        # Style Guide (Optional)
+        style_guide = st.text_area(
+            "Style Guide (Optional)",
+            placeholder="Enter any specific style guidelines or content requirements..."
+        )
+        # Generate Button
+        generate_button = st.button("Generate Content")
+        # Debug toggle
+        st.session_state.debug_mode = st.checkbox("Show Context Sources", value=st.session_state.get("debug_mode", False))
+    with col2:
+        st.subheader("Generated Content")
+        if generate_button:
+            with st.spinner("Generating content..."):
+                result = generate_document_section(
+                    section_type=section_type,
+                    protocol_id=target_protocol,
+                    style_guide=style_guide if style_guide else None
+                )
+                if result["status"] == "success":
+                    st.markdown(result["content"])
+                    # Show context sources if debug mode enabled
+                    if st.session_state.get("debug_mode", False):
+                        with st.expander("Context Sources"):
+                            for i, ctx in enumerate(result.get("context", [])):
+                                st.write(f"**Source {i+1}:** {ctx.get('metadata', {}).get('source', 'Unknown')}")
+                                st.write(f"**Section:** {ctx.get('metadata', {}).get('section', 'Unknown')}")
+                                st.write("---")
+                else:
+                    st.error(f"Error: {result.get('error', 'Unknown error')}")
+def render_traceability_tab():
+    """Render the Document Traceability tab."""
+    st.header("Cross-Document Traceability")
+    st.info("Analyze relationships between related documents (e.g., Protocol and SAP).")
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.subheader("Traceability Analysis Settings")
+        # Document Selection
+        document_options = []
+        for doc in st.session_state.documents:
+            document_options.append({
+                "id": doc.get("document_id", ""),
+                "label": f"{doc['filename']} ({doc.get('protocol_id', 'Unknown')})"
+            })
+        # Source Document
+        source_options = [{"id": "", "label": "--Select Source Document--"}] + document_options
+        source_doc = st.selectbox(
+            "Source Document",
+            options=source_options,
+            format_func=lambda x: x["label"]
+        )
+        # Target Document
+        target_options = [{"id": "", "label": "--Select Target Document--"}] + document_options
+        target_doc = st.selectbox(
+            "Target Document",
+            options=target_options,
+            format_func=lambda x: x["label"]
+        )
+        # Entity Type
+        entity_types = [
+            {"id": "objectives", "label": "Study Objectives"},
+            {"id": "endpoints", "label": "Endpoints"},
+            {"id": "population", "label": "Population Criteria"}
+        ]
+        entity_type = st.selectbox(
+            "Entity Type to Compare",
+            options=entity_types,
+            format_func=lambda x: x["label"]
+        )
+        # Analyze Button
+        analyze_button = st.button("Analyze Traceability")
+    with col2:
+        st.subheader("Analysis Results")
+        if analyze_button:
+            if not source_doc["id"] or not target_doc["id"]:
+                st.error("Please select both source and target documents.")
+            else:
+                with st.spinner("Analyzing traceability..."):
+                    result = analyze_document_traceability(
+                        source_id=source_doc["id"],
+                        target_id=target_doc["id"],
+                        entity_type=entity_type["id"]
+                    )
+                    if result["status"] == "success":
+                        st.markdown(result["analysis"])
+                        # Show matched pairs if debug mode enabled
+                        if st.session_state.get("debug_mode", False) and result.get("matched_pairs"):
+                            with st.expander("Matched Entity Pairs"):
+                                for i, pair in enumerate(result["matched_pairs"]):
+                                    st.write(f"**Pair {i+1}**")
+                                    st.write(f"**Source:** {pair.get('source_text', 'Unknown')}")
+                                    st.write(f"**Target:** {pair.get('target_text', 'Unknown')}")
+                                    st.write("---")
+                    else:
+                        st.error(f"Error: {result.get('error', 'Unknown error')}")
+def render_knowledge_explorer_tab():
+    """Render the Knowledge Base Explorer tab."""
+    st.header("Knowledge Base Explorer")
+    st.info("Explore the structured data extracted from documents in the knowledge base.")
+    # Entity Type Selection
+    entity_types = [
+        {"id": "studies", "label": "Studies"},
+        {"id": "objectives", "label": "Study Objectives"},
+        {"id": "endpoints", "label": "Endpoints"},
+        {"id": "population", "label": "Population Criteria"},
+        {"id": "documents", "label": "Documents"}
+    ]
+    entity_type = st.selectbox(
+        "Select Entity Type",
+        options=entity_types,
+        format_func=lambda x: x["label"]
+    )
+    # Filter by Protocol ID (Optional)
+    protocol_options = ["--All Protocols--"]
+    for doc in st.session_state.documents:
+        if doc.get("protocol_id") and doc.get("protocol_id") not in protocol_options:
+            protocol_options.append(doc.get("protocol_id"))
+    filter_protocol = st.selectbox(
+        "Filter by Protocol ID",
+        protocol_options
+    )
+    filter_protocol = None if filter_protocol == "--All Protocols--" else filter_protocol
+    # Search Query (Optional)
+    search_query = st.text_input(
+        "Search Query (Optional)",
+        placeholder="Enter text to search for..."
+    )
+    # Display Results
+    st.subheader("Results")
+    try:
+        # Retrieve data based on entity type
+        if entity_type["id"] == "studies":
+            if filter_protocol:
+                data = [knowledge_store.get_study_by_protocol_id(filter_protocol)]
+            else:
+                data = knowledge_store.get_all_studies()
+        elif entity_type["id"] == "objectives":
+            if filter_protocol:
+                data = knowledge_store.get_objectives_by_protocol_id(filter_protocol)
+            else:
+                # Get all objectives across protocols
+                data = []
+                documents = knowledge_store.get_all_documents()
+                protocol_ids = set()
+                for doc in documents:
+                    if "protocol_id" in doc and doc["protocol_id"]:
+                        protocol_ids.add(doc["protocol_id"])
+                for pid in protocol_ids:
+                    data.extend(knowledge_store.get_objectives_by_protocol_id(pid))
+        elif entity_type["id"] == "endpoints":
+            if filter_protocol:
+                data = knowledge_store.get_endpoints_by_protocol_id(filter_protocol)
+            else:
+                # Get all endpoints across protocols
+                data = []
+                documents = knowledge_store.get_all_documents()
+                protocol_ids = set()
+                for doc in documents:
+                    if "protocol_id" in doc and doc["protocol_id"]:
+                        protocol_ids.add(doc["protocol_id"])
+                for pid in protocol_ids:
+                    data.extend(knowledge_store.get_endpoints_by_protocol_id(pid))
+        elif entity_type["id"] == "population":
+            if filter_protocol:
+                data = knowledge_store.get_population_criteria_by_protocol_id(filter_protocol)
+            else:
+                # Get all population criteria across protocols
+                data = []
+                documents = knowledge_store.get_all_documents()
+                protocol_ids = set()
+                for doc in documents:
+                    if "protocol_id" in doc and doc["protocol_id"]:
+                        protocol_ids.add(doc["protocol_id"])
+                for pid in protocol_ids:
+                    data.extend(knowledge_store.get_population_criteria_by_protocol_id(pid))
+        elif entity_type["id"] == "documents":
+            if filter_protocol:
+                data = knowledge_store.get_documents_by_protocol_id(filter_protocol)
+            else:
+                data = knowledge_store.get_all_documents()
+        else:
+            data = []
+        # Filter by search query if provided
+        if search_query:
+            filtered_data = []
+            search_lower = search_query.lower()
+            for item in data:
+                # Convert item to string for searching
+                item_str = json.dumps(item).lower()
+                if search_lower in item_str:
+                    filtered_data.append(item)
+            data = filtered_data
+        # Display results
+        if not data:
+            st.info("No data found.")
+        else:
+            st.write(f"{len(data)} items found")
+            # Display as table if possible, otherwise as JSON
+            try:
+                df = pd.DataFrame(data)
+                st.dataframe(df, use_container_width=True)
+            except Exception as e:
+                st.json(data)
+    except Exception as e:
+        st.error(f"Error retrieving data: {e}")
+# =========================================================================
+# Main App
+# =========================================================================
+def main():
+    """Main application function."""
+    st.title("🧠 Pharmaceutical R&D Knowledge Ecosystem")
+    # Render the sidebar for document management
+    render_sidebar()
+    # Initialize knowledge base stats on first load
+    if st.session_state.knowledge_base_stats["documents"] == 0:
+        update_knowledge_base_stats()
+    # Main content tabs
+    tab1, tab2, tab3, tab4 = st.tabs([
+        "📝 Content Authoring",
+        "🤖 Protocol Coach",
+        "🔍 Knowledge Explorer",
+        "🔄 Cross-Document Traceability"
+    ])
+    with tab1:
+        render_content_authoring_tab()
+    with tab2:
+        render_protocol_coach_tab()
+    with tab3:
+        render_knowledge_explorer_tab()
+    with tab4:
+        render_traceability_tab()
+    # Footer
+    st.markdown("---")
+    st.caption("Pharmaceutical R&D Knowledge Ecosystem | A demonstration of AI-assisted document processing and knowledge management")
+if __name__ == "__main__":
+    main()