SPARKNET / demo /pages /5_📄_Document_Viewer.py
MHamdan's picture
Add password authentication and full demo application
a05467c
"""
Document Viewer - SPARKNET
View and explore processed documents from the state manager.
Provides visual chunk segmentation, OCR regions, and layout visualization.
"""
import base64
import hashlib
import html
import sys
import time
from pathlib import Path
from typing import List, Dict, Any

import streamlit as st
# Directory three levels above this file (the parent of the parent of the
# directory containing this page).
PROJECT_ROOT = Path(__file__).parent.parent.parent
# Put the project root and its "demo" subdirectory at the front of the import
# search path. NOTE(review): presumably this is what lets the state_manager,
# rag_config and auth imports below resolve when the page is run directly --
# confirm against the project layout.
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))
# Import state manager and RAG config
from state_manager import (
get_state_manager,
ProcessedDocument,
render_global_status_bar,
)
from rag_config import (
get_unified_rag_system,
get_store_stats,
get_indexed_documents,
get_chunks_for_document,
check_ollama,
)
# Page configuration: browser-tab title, tab icon and wide layout.
# The icon must be a real emoji character; the previous value was a
# mis-decoded byte sequence rather than the intended document emoji.
st.set_page_config(
    page_title="Document Viewer - SPARKNET",
    page_icon="📄",
    layout="wide",
)

# Authentication
# NOTE(review): the import is left after set_page_config, as in the original;
# presumably so that nothing in `auth` issues a Streamlit call before the page
# is configured -- confirm before moving it to the top of the file.
from auth import check_password, show_logout_button

# Halt the script run for unauthenticated visitors so nothing below renders.
if not check_password():
    st.stop()

show_logout_button()
# Custom CSS
# Injects a single <style> element defining the classes referenced by the HTML
# snippets this page emits: chunk cards (.chunk-card/.chunk-header/.chunk-text),
# OCR rows (.ocr-region), layout badges (.layout-region), document cards
# (.doc-card), summary tiles (.metric-mini), the scrollable text pane
# (.page-viewer) and the three confidence colours (.confidence-high/med/low).
# unsafe_allow_html=True is passed so the markup is emitted as HTML.
st.markdown("""
<style>
.chunk-card {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
border-radius: 10px;
padding: 12px;
margin: 8px 0;
border-left: 4px solid #4ECDC4;
}
.chunk-header {
display: flex;
justify-content: space-between;
font-size: 11px;
color: #8b949e;
margin-bottom: 6px;
}
.chunk-text {
font-size: 13px;
line-height: 1.5;
color: #c9d1d9;
font-family: 'Monaco', 'Menlo', monospace;
}
.ocr-region {
background: #161b22;
border-radius: 6px;
padding: 8px;
margin: 4px 0;
border-left: 3px solid;
}
.layout-region {
display: inline-block;
padding: 4px 8px;
margin: 3px;
border-radius: 4px;
font-size: 11px;
}
.doc-card {
background: #0d1117;
border-radius: 10px;
padding: 15px;
margin: 10px 0;
border: 1px solid #30363d;
cursor: pointer;
transition: border-color 0.2s;
}
.doc-card:hover {
border-color: #4ECDC4;
}
.doc-card.active {
border-color: #4ECDC4;
border-width: 2px;
}
.metric-mini {
background: #161b22;
border-radius: 6px;
padding: 8px;
text-align: center;
margin: 4px;
}
.metric-mini .value {
font-size: 18px;
font-weight: bold;
color: #4ECDC4;
}
.metric-mini .label {
font-size: 10px;
color: #8b949e;
text-transform: uppercase;
}
.page-viewer {
background: #0d1117;
border-radius: 10px;
padding: 20px;
max-height: 600px;
overflow-y: auto;
}
.confidence-high { color: #4ECDC4; }
.confidence-med { color: #ffc107; }
.confidence-low { color: #dc3545; }
</style>
""", unsafe_allow_html=True)
def get_chunk_color(index: int) -> str:
    """Pick a hex colour for the chunk at position ``index``.

    The palette holds twelve colours and is cycled, so any integer index
    (including one past the end of the palette) maps to a colour.
    """
    palette = (
        "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
        "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
        "#BB8FCE", "#85C1E9", "#F8B500", "#00CED1",
    )
    # Only the remainder matters: it is the slot within the repeating palette.
    _, slot = divmod(index, len(palette))
    return palette[slot]
def get_confidence_class(conf: float) -> str:
    """Map a confidence score to the CSS class used to colour it.

    Scores of at least 0.8 are "confidence-high", scores of at least 0.6 are
    "confidence-med", and everything else is "confidence-low".
    """
    # Thresholds are checked from the highest tier down; the first floor the
    # score reaches decides the class.
    tiers = (
        (0.8, "confidence-high"),
        (0.6, "confidence-med"),
    )
    return next(
        (css_class for floor, css_class in tiers if conf >= floor),
        "confidence-low",
    )
def get_layout_color(layout_type: str) -> str:
    """Return the hex colour associated with a layout region type.

    Args:
        layout_type: Region type name such as "title" or "table". Matching
            ignores case and surrounding whitespace.

    Returns:
        The colour for a known type, or "#666" for an unknown type or for a
        value that is not a string (for example ``None`` when a region has no
        usable type).
    """
    default_color = "#666"
    colors = {
        "title": "#FF6B6B",
        "heading": "#FF8E6B",
        "paragraph": "#4ECDC4",
        "text": "#45B7D1",
        "list": "#96CEB4",
        "table": "#FFEAA7",
        "figure": "#DDA0DD",
        "header": "#98D8C8",
        "footer": "#8b949e",
    }
    # A missing/None type must not crash the page; treat it like an unknown type.
    if not isinstance(layout_type, str):
        return default_color
    return colors.get(layout_type.strip().lower(), default_color)
# Initialize state manager
state_manager = get_state_manager()

# Header (the title emoji was previously a mis-decoded byte sequence)
st.markdown("# 📄 Document Viewer")
st.markdown("Explore processed documents, chunks, OCR regions, and layout structure")

# Global status bar
render_global_status_bar()

st.markdown("---")

# Get all documents from state and RAG; both lists feed the sidebar and the
# "no document selected" view further down.
all_state_docs = state_manager.get_all_documents()
rag_docs = get_indexed_documents()
# Sidebar for document selection
with st.sidebar:
    st.markdown("## 📚 Documents")

    # Processed documents from state manager
    if all_state_docs:
        st.markdown("### Recently Processed")

        # Looked up once: selecting a document triggers st.rerun(), so the
        # active id cannot change while this loop is still running.
        active_doc_id = state_manager.state.get("active_doc_id")

        # Ten most recent documents, newest first.
        for doc in reversed(all_state_docs[-10:]):
            is_active = doc.doc_id == active_doc_id

            # Only add an ellipsis when the filename was actually shortened.
            name = doc.filename
            short_name = name if len(name) <= 25 else f"{name[:25]}..."

            if st.button(
                f"📄 {short_name}",
                key=f"doc_{doc.doc_id}",
                use_container_width=True,
                type="primary" if is_active else "secondary",
            ):
                state_manager.set_active_document(doc.doc_id)
                st.rerun()

            # Mini stats
            cols = st.columns(3)
            cols[0].caption(f"📄 {doc.page_count}p")
            cols[1].caption(f"📦 {len(doc.chunks)}")
            if doc.indexed:
                cols[2].caption("✓ Indexed")

            st.markdown("---")
    else:
        st.info("No documents processed yet")
        st.markdown("Go to **Live Processing** to process documents")

    # RAG indexed documents
    if rag_docs:
        st.markdown("### 📊 RAG Index")
        st.caption(f"{len(rag_docs)} documents indexed")
        # Only the first five ids are listed to keep the sidebar short.
        for doc in rag_docs[:5]:
            rag_id = str(doc.get("document_id", "unknown"))
            short_id = rag_id if len(rag_id) <= 20 else f"{rag_id[:20]}..."
            st.caption(f"• {short_id}")
# Main content
active_doc = state_manager.get_active_document()


def _esc(value) -> str:
    """Escape ``value`` for interpolation into HTML rendered via unsafe_allow_html."""
    return html.escape(str(value))


def _preview(text: str, limit: int) -> str:
    """Return ``text`` cut to ``limit`` characters, appending "..." only when cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


if active_doc:
    # Document header
    header_col, status_col = st.columns([3, 1])
    with header_col:
        st.markdown(f"## 📄 {active_doc.filename}")
        st.caption(f"ID: `{active_doc.doc_id}` | Type: {active_doc.file_type} | Processed: {active_doc.created_at.strftime('%Y-%m-%d %H:%M')}")
    with status_col:
        if active_doc.indexed:
            st.success(f"✓ Indexed ({active_doc.indexed_chunks} chunks)")
        else:
            st.warning("Not indexed")

    # Summary metrics: one tile per (value, label) pair, left to right.
    summary_metrics = [
        (active_doc.page_count, "Pages"),
        (len(active_doc.chunks), "Chunks"),
        (len(active_doc.ocr_regions), "OCR Regions"),
        (len(active_doc.layout_data.get("regions", [])), "Layout Regions"),
        (f"{len(active_doc.raw_text):,}", "Characters"),
        (f"{active_doc.processing_time:.1f}s", "Process Time"),
    ]
    for metric_col, (metric_value, metric_label) in zip(
        st.columns(len(summary_metrics)), summary_metrics
    ):
        metric_col.markdown(f"""
<div class="metric-mini">
<div class="value">{metric_value}</div>
<div class="label">{metric_label}</div>
</div>
""", unsafe_allow_html=True)

    st.markdown("---")

    # Tabs for different views
    tab_chunks, tab_text, tab_ocr, tab_layout, tab_pages = st.tabs([
        "📦 Chunks",
        "📝 Full Text",
        "🔍 OCR Regions",
        "🗺️ Layout",
        "📄 Page Images",
    ])

    with tab_chunks:
        st.markdown("### Document Chunks")

        # Filter options. The selectboxes carry explicit keys because the
        # layout tab has a selectbox with the same label; identical labels and
        # options without keys would collide.
        filter_cols = st.columns([2, 1, 1])
        with filter_cols[0]:
            search_term = st.text_input("Search in chunks", placeholder="Enter search term...")
        with filter_cols[1]:
            # Sorted so the option order does not depend on set iteration order.
            chunk_types = sorted(
                {c.get("chunk_type", "text") for c in active_doc.chunks}, key=str
            )
            selected_type = st.selectbox(
                "Filter by type", ["All"] + chunk_types, key="chunk_type_filter"
            )
        with filter_cols[2]:
            page_filter = st.selectbox(
                "Filter by page",
                ["All"] + list(range(1, active_doc.page_count + 1)),
                key="chunk_page_filter",
            )

        # Filter chunks
        filtered_chunks = active_doc.chunks
        if search_term:
            needle = search_term.lower()
            filtered_chunks = [c for c in filtered_chunks if needle in c.get("text", "").lower()]
        if selected_type != "All":
            # Same "text" default as the option list above, so chunks without
            # an explicit type are found when "text" is selected.
            filtered_chunks = [c for c in filtered_chunks if c.get("chunk_type", "text") == selected_type]
        if page_filter != "All":
            # Chunk pages are stored zero-based; the filter shows them one-based.
            filtered_chunks = [c for c in filtered_chunks if c.get("page", 0) + 1 == page_filter]

        st.caption(f"Showing {len(filtered_chunks)} of {len(active_doc.chunks)} chunks")

        # Display chunks (capped at 30 to keep the page responsive)
        for i, chunk in enumerate(filtered_chunks[:30]):
            chunk_type = chunk.get("chunk_type", "text")
            chunk_text = chunk.get("text", "")
            conf = chunk.get("confidence", 0)
            color = get_chunk_color(i)
            conf_class = get_confidence_class(conf)

            with st.expander(f"[{i+1}] {chunk_type.upper()} - {_preview(chunk_text, 60)}", expanded=(i == 0)):
                # Document-derived values are escaped: this markup is rendered
                # as raw HTML, so "<" or "&" in the text must not be parsed.
                st.markdown(f"""
<div class="chunk-card" style="border-left-color: {color};">
<div class="chunk-header">
<span>ID: <code>{_esc(chunk.get('chunk_id', 'N/A'))}</code></span>
<span>Page {chunk.get('page', 0) + 1}</span>
<span class="{conf_class}">Confidence: {conf:.0%}</span>
</div>
<div class="chunk-text">{_esc(chunk_text)}</div>
</div>
""", unsafe_allow_html=True)

                # Bounding box info
                bbox = chunk.get("bbox")
                if bbox:
                    st.caption(f"Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})")

        if len(filtered_chunks) > 30:
            st.info(f"Showing 30 of {len(filtered_chunks)} matching chunks")

    with tab_text:
        st.markdown("### Extracted Text")

        # Text display options
        text_cols = st.columns([1, 1, 1])
        with text_cols[0]:
            # NOTE(review): this value is not read anywhere in this block, so
            # the checkbox currently has no effect on the rendered text.
            show_page_markers = st.checkbox("Show page markers", value=True)
        with text_cols[1]:
            font_size = st.slider("Font size", 10, 18, 13)
        with text_cols[2]:
            max_chars = st.slider("Max characters", 5000, 50000, 20000, 1000)

        text_to_display = active_doc.raw_text[:max_chars]
        if len(active_doc.raw_text) > max_chars:
            text_to_display += f"\n\n... [Truncated - {len(active_doc.raw_text) - max_chars:,} more characters]"

        # Escaped for the same reason as chunk text: rendered as raw HTML.
        st.markdown(f"""
<div class="page-viewer" style="font-size: {font_size}px;">
<pre style="white-space: pre-wrap; font-family: monospace; margin: 0;">{_esc(text_to_display)}</pre>
</div>
""", unsafe_allow_html=True)

        # Download button (always offers the full, untruncated text)
        st.download_button(
            "📥 Download Full Text",
            active_doc.raw_text,
            file_name=f"{active_doc.filename}.txt",
            mime="text/plain",
        )

    with tab_ocr:
        st.markdown("### OCR Regions")

        if active_doc.ocr_regions:
            # Group by page
            by_page = {}
            for region in active_doc.ocr_regions:
                by_page.setdefault(region.get("page", 0), []).append(region)

            # Page selector (keyed: the page-images tab has the same label)
            page_select = st.selectbox(
                "Select page",
                sorted(by_page),
                format_func=lambda x: f"Page {x + 1} ({len(by_page.get(x, []))} regions)",
                key="ocr_page_select",
            )

            if page_select is not None and page_select in by_page:
                page_regions = by_page[page_select]

                # Summary
                avg_conf = sum(r.get("confidence", 0) for r in page_regions) / len(page_regions) if page_regions else 0
                conf_class = get_confidence_class(avg_conf)
                st.markdown(f"**{len(page_regions)} regions** | Average confidence: <span class='{conf_class}'>{avg_conf:.0%}</span>", unsafe_allow_html=True)

                # Filter by confidence
                min_conf = st.slider("Minimum confidence", 0.0, 1.0, 0.5, 0.1)
                filtered_regions = [r for r in page_regions if r.get("confidence", 0) >= min_conf]

                for i, region in enumerate(filtered_regions[:50]):
                    conf = region.get("confidence", 0)
                    conf_class = get_confidence_class(conf)
                    color = "#4ECDC4" if conf >= 0.8 else "#ffc107" if conf >= 0.6 else "#dc3545"

                    st.markdown(f"""
<div class="ocr-region" style="border-left-color: {color};">
<div style="display: flex; justify-content: space-between; margin-bottom: 4px;">
<span style="font-size: 11px; color: #8b949e;">Region {i+1}</span>
<span class="{conf_class}" style="font-size: 11px;">{conf:.0%}</span>
</div>
<div style="font-family: monospace; font-size: 12px;">{_esc(region.get('text', ''))}</div>
</div>
""", unsafe_allow_html=True)

                if len(filtered_regions) > 50:
                    st.info(f"Showing 50 of {len(filtered_regions)} regions")
        else:
            st.info("No OCR regions available for this document")
            st.markdown("OCR regions are extracted during document processing with OCR enabled.")

    with tab_layout:
        st.markdown("### Layout Structure")

        layout_regions = active_doc.layout_data.get("regions", [])

        if layout_regions:
            # Group by type
            by_type = {}
            for region in layout_regions:
                by_type.setdefault(region.get("type", "unknown"), []).append(region)

            # Type summary: at most six columns, wrapping round when there are
            # more than six distinct types.
            st.markdown("**Detected region types:**")
            type_cols = st.columns(min(len(by_type), 6))
            for i, (rtype, regions) in enumerate(by_type.items()):
                color = get_layout_color(rtype)
                type_cols[i % 6].markdown(f"""
<div class="layout-region" style="background: {color}20; border: 1px solid {color};">
<strong>{_esc(rtype.title())}</strong>: {len(regions)}
</div>
""", unsafe_allow_html=True)

            st.markdown("---")

            # Layout regions list (keyed: the chunks tab has the same label)
            type_filter = st.selectbox(
                "Filter by type", ["All"] + list(by_type.keys()), key="layout_type_filter"
            )

            filtered_layout = layout_regions
            if type_filter != "All":
                filtered_layout = by_type.get(type_filter, [])

            for i, region in enumerate(filtered_layout[:30]):
                rtype = region.get("type", "unknown")
                conf = region.get("confidence", 0)
                color = get_layout_color(rtype)
                conf_class = get_confidence_class(conf)

                st.markdown(f"""
<div style="background: #161b22; border-radius: 6px; padding: 10px; margin: 6px 0; border-left: 3px solid {color};">
<div style="display: flex; justify-content: space-between;">
<span><strong style="color: {color};">{_esc(rtype.upper())}</strong></span>
<span>Page {region.get('page', 0) + 1}</span>
<span class="{conf_class}">{conf:.0%}</span>
</div>
</div>
""", unsafe_allow_html=True)

            if len(filtered_layout) > 30:
                st.info(f"Showing 30 of {len(filtered_layout)} regions")
        else:
            st.info("No layout regions available for this document")
            st.markdown("Layout regions are extracted during document processing with layout detection enabled.")

    with tab_pages:
        st.markdown("### Page Images")

        if active_doc.page_images:
            page_idx = st.selectbox(
                "Select page",
                list(range(len(active_doc.page_images))),
                format_func=lambda x: f"Page {x + 1}",
                key="page_image_select",
            )

            if page_idx is not None and page_idx < len(active_doc.page_images):
                img_data = active_doc.page_images[page_idx]

                # Display image. The stored value is interpolated into a
                # base64 PNG data URI -- assumes page_images holds base64
                # text; TODO confirm against the processing pipeline.
                st.image(
                    f"data:image/png;base64,{img_data}",
                    caption=f"Page {page_idx + 1}",
                    use_container_width=True,
                )

                # Overlay options
                st.markdown("**Overlay options:**")
                overlay_cols = st.columns(3)
                with overlay_cols[0]:
                    show_chunks = st.checkbox("Show chunk boundaries", value=False)
                with overlay_cols[1]:
                    show_ocr = st.checkbox("Show OCR regions", value=False)
                with overlay_cols[2]:
                    show_layout = st.checkbox("Show layout regions", value=False)

                if show_chunks or show_ocr or show_layout:
                    st.info("Overlay visualization coming soon - requires image annotation support")
        else:
            st.info("No page images available for this document")
            st.markdown("Page images are extracted from PDF documents during processing.")

    # Navigation to other modules
    st.markdown("---")
    st.markdown("### 🔗 Actions")

    nav_cols = st.columns(4)

    with nav_cols[0]:
        if st.button("💬 Ask Questions", use_container_width=True):
            st.switch_page("pages/2_💬_Interactive_RAG.py")
    with nav_cols[1]:
        if st.button("🎯 View Evidence", use_container_width=True):
            st.switch_page("pages/4_🎯_Evidence_Viewer.py")
    with nav_cols[2]:
        if st.button("📊 Compare Documents", use_container_width=True):
            st.switch_page("pages/3_📊_Document_Comparison.py")
    with nav_cols[3]:
        if st.button("🔬 Process New", use_container_width=True):
            st.switch_page("pages/1_🔬_Live_Processing.py")

else:
    # No active document
    st.markdown("## No Document Selected")

    intro_col, stats_col = st.columns(2)

    with intro_col:
        st.markdown("""
### Getting Started
1. Go to **Live Processing** to upload and process a document
2. Processed documents will appear in the sidebar
3. Click on a document to view its details
Or select a document from the sidebar if you've already processed some.
""")

        if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
            st.switch_page("pages/1_🔬_Live_Processing.py")

    with stats_col:
        # Show RAG stats
        stats = get_store_stats()

        st.markdown("### RAG Index Status")
        st.metric("Total Indexed Chunks", stats.get("total_chunks", 0))

        if rag_docs:
            st.markdown("**Indexed Documents:**")
            # List the first five; summarise the remainder in one line.
            for doc in rag_docs[:5]:
                doc_id = str(doc.get("document_id", "unknown"))
                chunks = doc.get("chunk_count", 0)
                st.caption(f"• {_preview(doc_id, 30)} ({chunks} chunks)")
            if len(rag_docs) > 5:
                st.caption(f"... and {len(rag_docs) - 5} more")