| """ |
| Document Viewer - SPARKNET |
| |
| View and explore processed documents from the state manager. |
| Provides visual chunk segmentation, OCR regions, and layout visualization. |
| """ |
|
|
| import streamlit as st |
| import sys |
| from pathlib import Path |
| import time |
| import hashlib |
| import base64 |
| from typing import List, Dict, Any |
|
|
| PROJECT_ROOT = Path(__file__).parent.parent.parent |
| sys.path.insert(0, str(PROJECT_ROOT)) |
| sys.path.insert(0, str(PROJECT_ROOT / "demo")) |
|
|
| |
| from state_manager import ( |
| get_state_manager, |
| ProcessedDocument, |
| render_global_status_bar, |
| ) |
| from rag_config import ( |
| get_unified_rag_system, |
| get_store_stats, |
| get_indexed_documents, |
| get_chunks_for_document, |
| check_ollama, |
| ) |
|
|
# Streamlit page chrome; must run before any other st.* call on the page.
st.set_page_config(
    page_title="Document Viewer - SPARKNET",
    page_icon="π",
    layout="wide"
)

# Auth gate: stop rendering entirely for unauthenticated users,
# otherwise expose the logout control.
from auth import check_password, show_logout_button
if not check_password():
    st.stop()
show_logout_button()
|
|
| |
# Page-wide CSS injected once: chunk cards, OCR region cards, layout chips,
# mini metric tiles, the scrollable page viewer, and the three confidence
# color classes used throughout the tabs below.
st.markdown("""
<style>
.chunk-card {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
border-radius: 10px;
padding: 12px;
margin: 8px 0;
border-left: 4px solid #4ECDC4;
}
.chunk-header {
display: flex;
justify-content: space-between;
font-size: 11px;
color: #8b949e;
margin-bottom: 6px;
}
.chunk-text {
font-size: 13px;
line-height: 1.5;
color: #c9d1d9;
font-family: 'Monaco', 'Menlo', monospace;
}
.ocr-region {
background: #161b22;
border-radius: 6px;
padding: 8px;
margin: 4px 0;
border-left: 3px solid;
}
.layout-region {
display: inline-block;
padding: 4px 8px;
margin: 3px;
border-radius: 4px;
font-size: 11px;
}
.doc-card {
background: #0d1117;
border-radius: 10px;
padding: 15px;
margin: 10px 0;
border: 1px solid #30363d;
cursor: pointer;
transition: border-color 0.2s;
}
.doc-card:hover {
border-color: #4ECDC4;
}
.doc-card.active {
border-color: #4ECDC4;
border-width: 2px;
}
.metric-mini {
background: #161b22;
border-radius: 6px;
padding: 8px;
text-align: center;
margin: 4px;
}
.metric-mini .value {
font-size: 18px;
font-weight: bold;
color: #4ECDC4;
}
.metric-mini .label {
font-size: 10px;
color: #8b949e;
text-transform: uppercase;
}
.page-viewer {
background: #0d1117;
border-radius: 10px;
padding: 20px;
max-height: 600px;
overflow-y: auto;
}
.confidence-high { color: #4ECDC4; }
.confidence-med { color: #ffc107; }
.confidence-low { color: #dc3545; }
</style>
""", unsafe_allow_html=True)
|
|
|
|
def get_chunk_color(index: int) -> str:
    """Return a distinct hex color for the chunk at *index*.

    Cycles through a fixed 12-color palette so adjacent chunks are
    visually distinguishable in the viewer.
    """
    palette = (
        "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
        "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
        "#BB8FCE", "#85C1E9", "#F8B500", "#00CED1",
    )
    return palette[index % len(palette)]
|
|
|
|
def get_confidence_class(conf: float) -> str:
    """Map a confidence score onto a CSS class name.

    Thresholds: >= 0.8 is high, >= 0.6 is medium, anything lower is low.
    """
    buckets = (
        (0.8, "confidence-high"),
        (0.6, "confidence-med"),
    )
    for threshold, css_class in buckets:
        if conf >= threshold:
            return css_class
    return "confidence-low"
|
|
|
|
def get_layout_color(layout_type: str) -> str:
    """Look up the display color for a layout region type.

    Matching is case-insensitive; unrecognised types fall back to grey.
    """
    palette = {
        "title": "#FF6B6B",
        "heading": "#FF8E6B",
        "paragraph": "#4ECDC4",
        "text": "#45B7D1",
        "list": "#96CEB4",
        "table": "#FFEAA7",
        "figure": "#DDA0DD",
        "header": "#98D8C8",
        "footer": "#8b949e",
    }
    key = layout_type.lower()
    return palette[key] if key in palette else "#666"
|
|
|
|
| |
# Shared cross-page state holder (processed documents, active selection).
state_manager = get_state_manager()

st.markdown("# π Document Viewer")
st.markdown("Explore processed documents, chunks, OCR regions, and layout structure")

# Status strip shown on every SPARKNET page.
render_global_status_bar()

st.markdown("---")

# Two document sources feed this page: documents processed in this session
# (state manager) and documents already present in the RAG index.
all_state_docs = state_manager.get_all_documents()
rag_docs = get_indexed_documents()
|
|
| |
# Sidebar: recently processed documents (clickable, newest first) plus a
# short summary of what is already in the RAG index.
# Fix: removed dead locals from the original — `selected_doc_id` was assigned
# but never read, and `card_class` was computed but never applied to any
# element (the button's `type` kwarg already conveys the active state).
with st.sidebar:
    st.markdown("## π Documents")

    if all_state_docs:
        st.markdown("### Recently Processed")

        # Show at most the 10 most recent documents, newest first.
        for doc in reversed(all_state_docs[-10:]):
            is_active = state_manager.state.get("active_doc_id") == doc.doc_id

            # Clicking a document makes it the active one and re-renders.
            if st.button(
                f"π {doc.filename[:25]}...",
                key=f"doc_{doc.doc_id}",
                use_container_width=True,
                type="primary" if is_active else "secondary"
            ):
                state_manager.set_active_document(doc.doc_id)
                st.rerun()

            # Mini summary row: page count, chunk count, index status.
            cols = st.columns(3)
            cols[0].caption(f"π {doc.page_count}p")
            cols[1].caption(f"π¦ {len(doc.chunks)}")
            if doc.indexed:
                cols[2].caption("β Indexed")
            st.markdown("---")
    else:
        st.info("No documents processed yet")
        st.markdown("Go to **Live Processing** to process documents")

    if rag_docs:
        st.markdown("### π RAG Index")
        st.caption(f"{len(rag_docs)} documents indexed")
        for doc in rag_docs[:5]:
            st.caption(f"β’ {doc.get('document_id', 'unknown')[:20]}...")
|
|
| |
# The document currently selected via the sidebar (None if nothing selected).
active_doc = state_manager.get_active_document()

if active_doc:
    # ---------- Header row: title/metadata left, index status right ----------
    col1, col2 = st.columns([3, 1])

    with col1:
        st.markdown(f"## π {active_doc.filename}")
        st.caption(f"ID: `{active_doc.doc_id}` | Type: {active_doc.file_type} | Processed: {active_doc.created_at.strftime('%Y-%m-%d %H:%M')}")

    with col2:
        if active_doc.indexed:
            st.success(f"β Indexed ({active_doc.indexed_chunks} chunks)")
        else:
            st.warning("Not indexed")

    # ---------- Mini metric strip (six equal columns) ----------
    metric_cols = st.columns(6)
    metric_cols[0].markdown(f"""
<div class="metric-mini">
<div class="value">{active_doc.page_count}</div>
<div class="label">Pages</div>
</div>
""", unsafe_allow_html=True)
    metric_cols[1].markdown(f"""
<div class="metric-mini">
<div class="value">{len(active_doc.chunks)}</div>
<div class="label">Chunks</div>
</div>
""", unsafe_allow_html=True)
    metric_cols[2].markdown(f"""
<div class="metric-mini">
<div class="value">{len(active_doc.ocr_regions)}</div>
<div class="label">OCR Regions</div>
</div>
""", unsafe_allow_html=True)
    layout_count = len(active_doc.layout_data.get("regions", []))
    metric_cols[3].markdown(f"""
<div class="metric-mini">
<div class="value">{layout_count}</div>
<div class="label">Layout Regions</div>
</div>
""", unsafe_allow_html=True)
    metric_cols[4].markdown(f"""
<div class="metric-mini">
<div class="value">{len(active_doc.raw_text):,}</div>
<div class="label">Characters</div>
</div>
""", unsafe_allow_html=True)
    metric_cols[5].markdown(f"""
<div class="metric-mini">
<div class="value">{active_doc.processing_time:.1f}s</div>
<div class="label">Process Time</div>
</div>
""", unsafe_allow_html=True)

    st.markdown("---")

    # ---------- Detail tabs ----------
    tab_chunks, tab_text, tab_ocr, tab_layout, tab_pages = st.tabs([
        "π¦ Chunks",
        "π Full Text",
        "π OCR Regions",
        "πΊοΈ Layout",
        "π Page Images"
    ])

    with tab_chunks:
        st.markdown("### Document Chunks")

        # Search / type / page filter controls.
        filter_cols = st.columns([2, 1, 1])
        with filter_cols[0]:
            search_term = st.text_input("Search in chunks", placeholder="Enter search term...")
        with filter_cols[1]:
            chunk_types = list(set(c.get("chunk_type", "text") for c in active_doc.chunks))
            selected_type = st.selectbox("Filter by type", ["All"] + chunk_types)
        with filter_cols[2]:
            # Options mix the "All" sentinel (str) with 1-based page numbers (int).
            page_filter = st.selectbox("Filter by page", ["All"] + list(range(1, active_doc.page_count + 1)))

        # Apply each filter in turn; chunk "page" is stored 0-based.
        filtered_chunks = active_doc.chunks
        if search_term:
            filtered_chunks = [c for c in filtered_chunks if search_term.lower() in c.get("text", "").lower()]
        if selected_type != "All":
            filtered_chunks = [c for c in filtered_chunks if c.get("chunk_type") == selected_type]
        if page_filter != "All":
            filtered_chunks = [c for c in filtered_chunks if c.get("page", 0) + 1 == page_filter]

        st.caption(f"Showing {len(filtered_chunks)} of {len(active_doc.chunks)} chunks")

        # Render at most 30 chunks; only the first expander starts open.
        for i, chunk in enumerate(filtered_chunks[:30]):
            chunk_type = chunk.get("chunk_type", "text")
            conf = chunk.get("confidence", 0)
            color = get_chunk_color(i)
            conf_class = get_confidence_class(conf)

            with st.expander(f"[{i+1}] {chunk_type.upper()} - {chunk.get('text', '')[:60]}...", expanded=(i == 0)):
                st.markdown(f"""
<div class="chunk-card" style="border-left-color: {color};">
<div class="chunk-header">
<span>ID: <code>{chunk.get('chunk_id', 'N/A')}</code></span>
<span>Page {chunk.get('page', 0) + 1}</span>
<span class="{conf_class}">Confidence: {conf:.0%}</span>
</div>
<div class="chunk-text">{chunk.get('text', '')}</div>
</div>
""", unsafe_allow_html=True)

                # Bounding box, when the chunk carries one: (x0, y0) - (x1, y1).
                bbox = chunk.get("bbox")
                if bbox:
                    st.caption(f"Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})")

        if len(filtered_chunks) > 30:
            st.info(f"Showing 30 of {len(filtered_chunks)} matching chunks")

    with tab_text:
        st.markdown("### Extracted Text")

        text_cols = st.columns([1, 1, 1])
        with text_cols[0]:
            # NOTE(review): this toggle is never read below — confirm intent.
            show_page_markers = st.checkbox("Show page markers", value=True)
        with text_cols[1]:
            font_size = st.slider("Font size", 10, 18, 13)
        with text_cols[2]:
            max_chars = st.slider("Max characters", 5000, 50000, 20000, 1000)

        # Truncate very large documents for display; the full text is still
        # available via the download button below.
        text_to_display = active_doc.raw_text[:max_chars]
        if len(active_doc.raw_text) > max_chars:
            text_to_display += f"\n\n... [Truncated - {len(active_doc.raw_text) - max_chars:,} more characters]"

        st.markdown(f"""
<div class="page-viewer" style="font-size: {font_size}px;">
<pre style="white-space: pre-wrap; font-family: monospace; margin: 0;">{text_to_display}</pre>
</div>
""", unsafe_allow_html=True)

        st.download_button(
            "π₯ Download Full Text",
            active_doc.raw_text,
            file_name=f"{active_doc.filename}.txt",
            mime="text/plain"
        )

    with tab_ocr:
        st.markdown("### OCR Regions")

        if active_doc.ocr_regions:
            # Group regions by their 0-based page index.
            by_page = {}
            for region in active_doc.ocr_regions:
                page = region.get("page", 0)
                if page not in by_page:
                    by_page[page] = []
                by_page[page].append(region)

            page_select = st.selectbox(
                "Select page",
                sorted(by_page.keys()),
                format_func=lambda x: f"Page {x + 1} ({len(by_page.get(x, []))} regions)"
            )

            if page_select is not None and page_select in by_page:
                page_regions = by_page[page_select]

                # Page-level confidence summary (guard against empty list).
                avg_conf = sum(r.get("confidence", 0) for r in page_regions) / len(page_regions) if page_regions else 0
                conf_class = get_confidence_class(avg_conf)

                st.markdown(f"**{len(page_regions)} regions** | Average confidence: <span class='{conf_class}'>{avg_conf:.0%}</span>", unsafe_allow_html=True)

                # Hide regions below the user-chosen confidence threshold.
                min_conf = st.slider("Minimum confidence", 0.0, 1.0, 0.5, 0.1)
                filtered_regions = [r for r in page_regions if r.get("confidence", 0) >= min_conf]

                for i, region in enumerate(filtered_regions[:50]):
                    conf = region.get("confidence", 0)
                    conf_class = get_confidence_class(conf)
                    # Same thresholds as get_confidence_class, expressed as a border color.
                    color = "#4ECDC4" if conf >= 0.8 else "#ffc107" if conf >= 0.6 else "#dc3545"

                    st.markdown(f"""
<div class="ocr-region" style="border-left-color: {color};">
<div style="display: flex; justify-content: space-between; margin-bottom: 4px;">
<span style="font-size: 11px; color: #8b949e;">Region {i+1}</span>
<span class="{conf_class}" style="font-size: 11px;">{conf:.0%}</span>
</div>
<div style="font-family: monospace; font-size: 12px;">{region.get('text', '')}</div>
</div>
""", unsafe_allow_html=True)

                if len(filtered_regions) > 50:
                    st.info(f"Showing 50 of {len(filtered_regions)} regions")
        else:
            st.info("No OCR regions available for this document")
            st.markdown("OCR regions are extracted during document processing with OCR enabled.")

    with tab_layout:
        st.markdown("### Layout Structure")

        layout_regions = active_doc.layout_data.get("regions", [])

        if layout_regions:
            # Group regions by detected type (title, paragraph, table, ...).
            by_type = {}
            for region in layout_regions:
                rtype = region.get("type", "unknown")
                if rtype not in by_type:
                    by_type[rtype] = []
                by_type[rtype].append(region)

            # Summary chips, wrapped into at most six columns.
            st.markdown("**Detected region types:**")
            type_cols = st.columns(min(len(by_type), 6))
            for i, (rtype, regions) in enumerate(by_type.items()):
                color = get_layout_color(rtype)
                type_cols[i % 6].markdown(f"""
<div class="layout-region" style="background: {color}20; border: 1px solid {color};">
<strong>{rtype.title()}</strong>: {len(regions)}
</div>
""", unsafe_allow_html=True)

            st.markdown("---")

            type_filter = st.selectbox("Filter by type", ["All"] + list(by_type.keys()))

            filtered_layout = layout_regions
            if type_filter != "All":
                filtered_layout = by_type.get(type_filter, [])

            for i, region in enumerate(filtered_layout[:30]):
                rtype = region.get("type", "unknown")
                conf = region.get("confidence", 0)
                color = get_layout_color(rtype)
                conf_class = get_confidence_class(conf)

                st.markdown(f"""
<div style="background: #161b22; border-radius: 6px; padding: 10px; margin: 6px 0; border-left: 3px solid {color};">
<div style="display: flex; justify-content: space-between;">
<span><strong style="color: {color};">{rtype.upper()}</strong></span>
<span>Page {region.get('page', 0) + 1}</span>
<span class="{conf_class}">{conf:.0%}</span>
</div>
</div>
""", unsafe_allow_html=True)

            if len(filtered_layout) > 30:
                st.info(f"Showing 30 of {len(filtered_layout)} regions")
        else:
            st.info("No layout regions available for this document")
            st.markdown("Layout regions are extracted during document processing with layout detection enabled.")

    with tab_pages:
        st.markdown("### Page Images")

        if active_doc.page_images:
            page_idx = st.selectbox(
                "Select page",
                list(range(len(active_doc.page_images))),
                format_func=lambda x: f"Page {x + 1}"
            )

            if page_idx is not None and page_idx < len(active_doc.page_images):
                # Page images appear to be stored as base64 PNG strings, given
                # the data URL below — confirm against the processing pipeline.
                img_data = active_doc.page_images[page_idx]

                st.image(
                    f"data:image/png;base64,{img_data}",
                    caption=f"Page {page_idx + 1}",
                    use_container_width=True
                )

                # Overlay toggles are placeholders: no annotation support yet.
                st.markdown("**Overlay options:**")
                overlay_cols = st.columns(3)
                with overlay_cols[0]:
                    show_chunks = st.checkbox("Show chunk boundaries", value=False)
                with overlay_cols[1]:
                    show_ocr = st.checkbox("Show OCR regions", value=False)
                with overlay_cols[2]:
                    show_layout = st.checkbox("Show layout regions", value=False)

                if show_chunks or show_ocr or show_layout:
                    st.info("Overlay visualization coming soon - requires image annotation support")
        else:
            st.info("No page images available for this document")
            st.markdown("Page images are extracted from PDF documents during processing.")

    # ---------- Cross-page navigation shortcuts ----------
    st.markdown("---")
    st.markdown("### π Actions")

    nav_cols = st.columns(4)

    with nav_cols[0]:
        if st.button("π¬ Ask Questions", use_container_width=True):
            st.switch_page("pages/2_π¬_Interactive_RAG.py")

    with nav_cols[1]:
        if st.button("π― View Evidence", use_container_width=True):
            st.switch_page("pages/4_π―_Evidence_Viewer.py")

    with nav_cols[2]:
        if st.button("π Compare Documents", use_container_width=True):
            st.switch_page("pages/3_π_Document_Comparison.py")

    with nav_cols[3]:
        if st.button("π¬ Process New", use_container_width=True):
            st.switch_page("pages/1_π¬_Live_Processing.py")

else:
    # Empty state: nothing selected — onboarding text plus RAG index summary.
    st.markdown("## No Document Selected")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("""
### Getting Started

1. Go to **Live Processing** to upload and process a document
2. Processed documents will appear in the sidebar
3. Click on a document to view its details

Or select a document from the sidebar if you've already processed some.
""")

        if st.button("π¬ Go to Live Processing", type="primary", use_container_width=True):
            st.switch_page("pages/1_π¬_Live_Processing.py")

    with col2:
        # High-level RAG store stats shown even when no document is active.
        stats = get_store_stats()
        st.markdown("### RAG Index Status")
        st.metric("Total Indexed Chunks", stats.get("total_chunks", 0))

        if rag_docs:
            st.markdown("**Indexed Documents:**")
            for doc in rag_docs[:5]:
                doc_id = doc.get("document_id", "unknown")
                chunks = doc.get("chunk_count", 0)
                st.caption(f"β’ {doc_id[:30]}... ({chunks} chunks)")

            if len(rag_docs) > 5:
                st.caption(f"... and {len(rag_docs) - 5} more")
|
|