Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / demo /pages /5_📄_Document_Viewer.py

MHamdan

Add password authentication and full demo application

a05467c 3 months ago

raw

history blame contribute delete

19.5 kB

	"""
	Document Viewer - SPARKNET

	View and explore processed documents from the state manager.
	Provides visual chunk segmentation, OCR regions, and layout visualization.
	"""

	import base64
	import hashlib
	import html
	import sys
	import time
	from pathlib import Path
	from typing import Any, Dict, List

	import streamlit as st

	PROJECT_ROOT = Path(__file__).parent.parent.parent
	sys.path.insert(0, str(PROJECT_ROOT))
	sys.path.insert(0, str(PROJECT_ROOT / "demo"))

	# Import state manager and RAG config
	from state_manager import (
	get_state_manager,
	ProcessedDocument,
	render_global_status_bar,
	)
	from rag_config import (
	get_unified_rag_system,
	get_store_stats,
	get_indexed_documents,
	get_chunks_for_document,
	check_ollama,
	)

	# Page configuration. This is issued before any other Streamlit command in this file.
	st.set_page_config(page_title="Document Viewer - SPARKNET", page_icon="📄", layout="wide")

	# Authentication gate: the auth helpers are imported here, after the page
	# configuration call, and the script run is halted when the password check fails.
	from auth import check_password, show_logout_button

	is_authenticated = check_password()
	if not is_authenticated:
	    st.stop()
	show_logout_button()

	# Custom CSS: injects a <style> element defining the classes used by the HTML
	# snippets this page renders further down (chunk cards, OCR regions, layout
	# badges, metric tiles, the full-text viewer, and the confidence colour classes).
	# The call passes unsafe_allow_html=True so the <style> tag is emitted as HTML.
	st.markdown("""
	<style>
	.chunk-card {
	background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
	border-radius: 10px;
	padding: 12px;
	margin: 8px 0;
	border-left: 4px solid #4ECDC4;
	}
	.chunk-header {
	display: flex;
	justify-content: space-between;
	font-size: 11px;
	color: #8b949e;
	margin-bottom: 6px;
	}
	.chunk-text {
	font-size: 13px;
	line-height: 1.5;
	color: #c9d1d9;
	font-family: 'Monaco', 'Menlo', monospace;
	}
	.ocr-region {
	background: #161b22;
	border-radius: 6px;
	padding: 8px;
	margin: 4px 0;
	border-left: 3px solid;
	}
	.layout-region {
	display: inline-block;
	padding: 4px 8px;
	margin: 3px;
	border-radius: 4px;
	font-size: 11px;
	}
	.doc-card {
	background: #0d1117;
	border-radius: 10px;
	padding: 15px;
	margin: 10px 0;
	border: 1px solid #30363d;
	cursor: pointer;
	transition: border-color 0.2s;
	}
	.doc-card:hover {
	border-color: #4ECDC4;
	}
	.doc-card.active {
	border-color: #4ECDC4;
	border-width: 2px;
	}
	.metric-mini {
	background: #161b22;
	border-radius: 6px;
	padding: 8px;
	text-align: center;
	margin: 4px;
	}
	.metric-mini .value {
	font-size: 18px;
	font-weight: bold;
	color: #4ECDC4;
	}
	.metric-mini .label {
	font-size: 10px;
	color: #8b949e;
	text-transform: uppercase;
	}
	.page-viewer {
	background: #0d1117;
	border-radius: 10px;
	padding: 20px;
	max-height: 600px;
	overflow-y: auto;
	}
	.confidence-high { color: #4ECDC4; }
	.confidence-med { color: #ffc107; }
	.confidence-low { color: #dc3545; }
	</style>
	""", unsafe_allow_html=True)


	def get_chunk_color(index: int) -> str:
	    """Return the hex color for the chunk at position ``index``.

	    The palette holds twelve colors; positions beyond its end wrap around
	    to the start so every index maps to a color.
	    """
	    palette = (
	        "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
	        "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
	        "#BB8FCE", "#85C1E9", "#F8B500", "#00CED1",
	    )
	    slot = index % len(palette)
	    return palette[slot]


	def get_confidence_class(conf: float) -> str:
	    """Map a confidence score to the CSS class used to color it.

	    Scores of at least 0.8 are "high", scores of at least 0.6 are "med",
	    and everything else is "low".
	    """
	    # Tiers are checked from the highest floor down; the first match wins.
	    tiers = (
	        (0.8, "confidence-high"),
	        (0.6, "confidence-med"),
	    )
	    for floor, css_class in tiers:
	        if conf >= floor:
	            return css_class
	    return "confidence-low"


	def get_layout_color(layout_type: str) -> str:
	    """Return the hex color associated with a layout region type.

	    The lookup is case-insensitive. Types that are not in the table, and
	    values that are not strings at all (for example ``None`` from a region
	    whose type is missing), fall back to the neutral default ``"#666"``.
	    """
	    default_color = "#666"
	    # Guard before calling .lower(): a non-string type would otherwise
	    # raise AttributeError and abort rendering.
	    if not isinstance(layout_type, str):
	        return default_color
	    colors = {
	        "title": "#FF6B6B",
	        "heading": "#FF8E6B",
	        "paragraph": "#4ECDC4",
	        "text": "#45B7D1",
	        "list": "#96CEB4",
	        "table": "#FFEAA7",
	        "figure": "#DDA0DD",
	        "header": "#98D8C8",
	        "footer": "#8b949e",
	    }
	    return colors.get(layout_type.lower(), default_color)


	# Initialize state manager
	state_manager = get_state_manager()

	# Header
	st.markdown("# 📄 Document Viewer")
	st.markdown("Explore processed documents, chunks, OCR regions, and layout structure")

	# Global status bar
	render_global_status_bar()

	st.markdown("---")

	# Get all documents from state and RAG
	all_state_docs = state_manager.get_all_documents()
	rag_docs = get_indexed_documents()

	# Sidebar for document selection
	with st.sidebar:
	    st.markdown("## 📚 Documents")

	    # Processed documents from state manager
	    if all_state_docs:
	        st.markdown("### Recently Processed")

	        # Read the active id once; inside the loop it can only change via a
	        # button click, which immediately triggers a rerun.
	        active_doc_id = state_manager.state.get("active_doc_id")

	        # Show the ten most recent documents, newest first.
	        for doc in reversed(all_state_docs[-10:]):
	            is_active = active_doc_id == doc.doc_id

	            # Only add an ellipsis when the filename was actually shortened.
	            name = doc.filename
	            label = name if len(name) <= 25 else f"{name[:25]}..."

	            if st.button(
	                f"📄 {label}",
	                key=f"doc_{doc.doc_id}",
	                use_container_width=True,
	                type="primary" if is_active else "secondary",
	            ):
	                state_manager.set_active_document(doc.doc_id)
	                st.rerun()

	            # Mini stats
	            cols = st.columns(3)
	            cols[0].caption(f"📄 {doc.page_count}p")
	            cols[1].caption(f"📦 {len(doc.chunks)}")
	            if doc.indexed:
	                cols[2].caption("✓ Indexed")
	            st.markdown("---")
	    else:
	        st.info("No documents processed yet")
	        st.markdown("Go to Live Processing to process documents")

	    # RAG indexed documents
	    if rag_docs:
	        st.markdown("### 📊 RAG Index")
	        st.caption(f"{len(rag_docs)} documents indexed")
	        for doc in rag_docs[:5]:
	            # str() keeps the slice safe if the stored id is not a string.
	            rag_id = str(doc.get("document_id", "unknown"))
	            shown_id = rag_id if len(rag_id) <= 20 else f"{rag_id[:20]}..."
	            st.caption(f"• {shown_id}")

	# Main content


	def _esc(value) -> str:
	    """Return ``value`` as text that is safe to embed in an HTML snippet."""
	    return html.escape(str(value))


	def _preview(value, limit: int) -> str:
	    """Shorten ``value`` to ``limit`` characters, adding "..." only when cut."""
	    text = str(value)
	    return text if len(text) <= limit else f"{text[:limit]}..."


	def _metric_card(value, label: str) -> str:
	    """Build the HTML for one ``metric-mini`` summary tile."""
	    return (
	        '<div class="metric-mini">'
	        f'<div class="value">{_esc(value)}</div>'
	        f'<div class="label">{_esc(label)}</div>'
	        "</div>"
	    )


	active_doc = state_manager.get_active_document()

	if active_doc:
	    # Document header
	    col1, col2 = st.columns([3, 1])

	    with col1:
	        st.markdown(f"## 📄 {active_doc.filename}")
	        processed_at = active_doc.created_at.strftime("%Y-%m-%d %H:%M")
	        st.caption(
	            f"ID: `{active_doc.doc_id}` | Type: {active_doc.file_type} | Processed: {processed_at}"
	        )

	    with col2:
	        if active_doc.indexed:
	            st.success(f"✓ Indexed ({active_doc.indexed_chunks} chunks)")
	        else:
	            st.warning("Not indexed")

	    # Summary metrics: one tile per column, in display order.
	    layout_regions = active_doc.layout_data.get("regions", [])
	    summary_metrics = [
	        (active_doc.page_count, "Pages"),
	        (len(active_doc.chunks), "Chunks"),
	        (len(active_doc.ocr_regions), "OCR Regions"),
	        (len(layout_regions), "Layout Regions"),
	        (f"{len(active_doc.raw_text):,}", "Characters"),
	        (f"{active_doc.processing_time:.1f}s", "Process Time"),
	    ]
	    metric_cols = st.columns(len(summary_metrics))
	    for metric_col, (metric_value, metric_label) in zip(metric_cols, summary_metrics):
	        metric_col.markdown(_metric_card(metric_value, metric_label), unsafe_allow_html=True)

	    st.markdown("---")

	    # Tabs for different views
	    tab_chunks, tab_text, tab_ocr, tab_layout, tab_pages = st.tabs([
	        "📦 Chunks",
	        "📝 Full Text",
	        "🔍 OCR Regions",
	        "🗺️ Layout",
	        "📄 Page Images",
	    ])

	    with tab_chunks:
	        st.markdown("### Document Chunks")

	        # Filter options
	        filter_cols = st.columns([2, 1, 1])
	        with filter_cols[0]:
	            search_term = st.text_input("Search in chunks", placeholder="Enter search term...")
	        with filter_cols[1]:
	            # Sorted so the option order is stable between runs; key=str keeps
	            # the sort from failing on mixed value types.
	            chunk_types = sorted(
	                {c.get("chunk_type", "text") for c in active_doc.chunks}, key=str
	            )
	            # Explicit key: the Layout tab has a selectbox with the same label,
	            # and identical option lists would otherwise produce a duplicate widget.
	            selected_type = st.selectbox(
	                "Filter by type",
	                ["All"] + chunk_types,
	                key="docviewer_chunk_type_filter",
	            )
	        with filter_cols[2]:
	            page_filter = st.selectbox(
	                "Filter by page", ["All"] + list(range(1, active_doc.page_count + 1))
	            )

	        # Filter chunks
	        filtered_chunks = active_doc.chunks
	        if search_term:
	            needle = search_term.lower()
	            filtered_chunks = [c for c in filtered_chunks if needle in c.get("text", "").lower()]
	        if selected_type != "All":
	            # Use the same "text" default as the option list above so chunks
	            # without an explicit type still match the "text" filter.
	            filtered_chunks = [
	                c for c in filtered_chunks if c.get("chunk_type", "text") == selected_type
	            ]
	        if page_filter != "All":
	            # Chunk pages are stored 0-based; the selector shows 1-based numbers.
	            filtered_chunks = [c for c in filtered_chunks if c.get("page", 0) + 1 == page_filter]

	        st.caption(f"Showing {len(filtered_chunks)} of {len(active_doc.chunks)} chunks")

	        # Display chunks (capped at 30 per view)
	        for i, chunk in enumerate(filtered_chunks[:30]):
	            chunk_type = chunk.get("chunk_type", "text")
	            chunk_text = chunk.get("text", "")
	            chunk_id = chunk.get("chunk_id", "N/A")
	            chunk_page = chunk.get("page", 0) + 1
	            conf = chunk.get("confidence", 0)
	            color = get_chunk_color(i)
	            conf_class = get_confidence_class(conf)

	            with st.expander(
	                f"[{i+1}] {chunk_type.upper()} - {_preview(chunk_text, 60)}",
	                expanded=(i == 0),
	            ):
	                # Document-derived values are escaped because this snippet is
	                # rendered as raw HTML.
	                st.markdown(
	                    f'<div class="chunk-card" style="border-left-color: {color};">'
	                    '<div class="chunk-header">'
	                    f"<span>ID: <code>{_esc(chunk_id)}</code></span>"
	                    f"<span>Page {chunk_page}</span>"
	                    f'<span class="{conf_class}">Confidence: {conf:.0%}</span>'
	                    "</div>"
	                    f'<div class="chunk-text">{_esc(chunk_text)}</div>'
	                    "</div>",
	                    unsafe_allow_html=True,
	                )

	                # Bounding box info
	                bbox = chunk.get("bbox")
	                if bbox:
	                    st.caption(
	                        f"Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})"
	                    )

	        if len(filtered_chunks) > 30:
	            st.info(f"Showing 30 of {len(filtered_chunks)} matching chunks")

	    with tab_text:
	        st.markdown("### Extracted Text")

	        # Text display options
	        text_cols = st.columns([1, 1, 1])
	        with text_cols[0]:
	            # NOTE(review): this checkbox is rendered but its value is not
	            # applied to the displayed text anywhere in this file.
	            st.checkbox("Show page markers", value=True)
	        with text_cols[1]:
	            font_size = st.slider("Font size", 10, 18, 13)
	        with text_cols[2]:
	            max_chars = st.slider("Max characters", 5000, 50000, 20000, 1000)

	        text_to_display = active_doc.raw_text[:max_chars]
	        if len(active_doc.raw_text) > max_chars:
	            remaining = len(active_doc.raw_text) - max_chars
	            text_to_display += f"\n\n... [Truncated - {remaining:,} more characters]"

	        # The extracted text is escaped so markup inside the document is shown
	        # literally instead of being interpreted by the browser.
	        st.markdown(
	            f'<div class="page-viewer" style="font-size: {font_size}px;">'
	            '<pre style="white-space: pre-wrap; font-family: monospace; margin: 0;">'
	            f"{_esc(text_to_display)}</pre>"
	            "</div>",
	            unsafe_allow_html=True,
	        )

	        # Download button (always offers the full, untruncated text)
	        st.download_button(
	            "📥 Download Full Text",
	            active_doc.raw_text,
	            file_name=f"{active_doc.filename}.txt",
	            mime="text/plain",
	        )

	    with tab_ocr:
	        st.markdown("### OCR Regions")

	        if active_doc.ocr_regions:
	            # Group by page
	            by_page = {}
	            for region in active_doc.ocr_regions:
	                by_page.setdefault(region.get("page", 0), []).append(region)

	            # Page selector
	            page_select = st.selectbox(
	                "Select page",
	                sorted(by_page.keys()),
	                format_func=lambda x: f"Page {x + 1} ({len(by_page.get(x, []))} regions)",
	                key="docviewer_ocr_page_select",
	            )

	            if page_select is not None and page_select in by_page:
	                page_regions = by_page[page_select]

	                # Summary
	                if page_regions:
	                    avg_conf = sum(r.get("confidence", 0) for r in page_regions) / len(page_regions)
	                else:
	                    avg_conf = 0
	                conf_class = get_confidence_class(avg_conf)

	                st.markdown(
	                    f"{len(page_regions)} regions | Average confidence: "
	                    f"<span class='{conf_class}'>{avg_conf:.0%}</span>",
	                    unsafe_allow_html=True,
	                )

	                # Filter by confidence
	                min_conf = st.slider("Minimum confidence", 0.0, 1.0, 0.5, 0.1)
	                filtered_regions = [r for r in page_regions if r.get("confidence", 0) >= min_conf]

	                # Display regions (capped at 50 per view)
	                for i, region in enumerate(filtered_regions[:50]):
	                    conf = region.get("confidence", 0)
	                    conf_class = get_confidence_class(conf)
	                    color = "#4ECDC4" if conf >= 0.8 else "#ffc107" if conf >= 0.6 else "#dc3545"
	                    region_text = region.get("text", "")

	                    st.markdown(
	                        f'<div class="ocr-region" style="border-left-color: {color};">'
	                        '<div style="display: flex; justify-content: space-between; margin-bottom: 4px;">'
	                        f'<span style="font-size: 11px; color: #8b949e;">Region {i+1}</span>'
	                        f'<span class="{conf_class}" style="font-size: 11px;">{conf:.0%}</span>'
	                        "</div>"
	                        f'<div style="font-family: monospace; font-size: 12px;">{_esc(region_text)}</div>'
	                        "</div>",
	                        unsafe_allow_html=True,
	                    )

	                if len(filtered_regions) > 50:
	                    st.info(f"Showing 50 of {len(filtered_regions)} regions")
	        else:
	            st.info("No OCR regions available for this document")
	            st.markdown("OCR regions are extracted during document processing with OCR enabled.")

	    with tab_layout:
	        st.markdown("### Layout Structure")

	        if layout_regions:
	            # Group by type
	            by_type = {}
	            for region in layout_regions:
	                by_type.setdefault(region.get("type", "unknown"), []).append(region)

	            # Type summary: at most six columns, reused cyclically.
	            st.markdown("Detected region types:")
	            type_cols = st.columns(min(len(by_type), 6))
	            for i, (rtype, regions) in enumerate(by_type.items()):
	                color = get_layout_color(rtype)
	                type_cols[i % len(type_cols)].markdown(
	                    f'<div class="layout-region" style="background: {color}20; border: 1px solid {color};">'
	                    f"<strong>{_esc(rtype.title())}</strong>: {len(regions)}"
	                    "</div>",
	                    unsafe_allow_html=True,
	                )

	            st.markdown("---")

	            # Layout regions list. Explicit key: the Chunks tab has a selectbox
	            # with the same label.
	            type_filter = st.selectbox(
	                "Filter by type",
	                ["All"] + list(by_type.keys()),
	                key="docviewer_layout_type_filter",
	            )

	            filtered_layout = layout_regions
	            if type_filter != "All":
	                filtered_layout = by_type.get(type_filter, [])

	            # Display regions (capped at 30 per view)
	            for i, region in enumerate(filtered_layout[:30]):
	                rtype = region.get("type", "unknown")
	                region_page = region.get("page", 0) + 1
	                conf = region.get("confidence", 0)
	                color = get_layout_color(rtype)
	                conf_class = get_confidence_class(conf)

	                st.markdown(
	                    '<div style="background: #161b22; border-radius: 6px; padding: 10px; '
	                    f'margin: 6px 0; border-left: 3px solid {color};">'
	                    '<div style="display: flex; justify-content: space-between;">'
	                    f'<span><strong style="color: {color};">{_esc(rtype.upper())}</strong></span>'
	                    f"<span>Page {region_page}</span>"
	                    f'<span class="{conf_class}">{conf:.0%}</span>'
	                    "</div>"
	                    "</div>",
	                    unsafe_allow_html=True,
	                )

	            if len(filtered_layout) > 30:
	                st.info(f"Showing 30 of {len(filtered_layout)} regions")
	        else:
	            st.info("No layout regions available for this document")
	            st.markdown("Layout regions are extracted during document processing with layout detection enabled.")

	    with tab_pages:
	        st.markdown("### Page Images")

	        if active_doc.page_images:
	            page_idx = st.selectbox(
	                "Select page",
	                list(range(len(active_doc.page_images))),
	                format_func=lambda x: f"Page {x + 1}",
	                key="docviewer_image_page_select",
	            )

	            if page_idx is not None and page_idx < len(active_doc.page_images):
	                img_data = active_doc.page_images[page_idx]

	                # Display image (the stored value is embedded as a base64 PNG data URI)
	                st.image(
	                    f"data:image/png;base64,{img_data}",
	                    caption=f"Page {page_idx + 1}",
	                    use_container_width=True,
	                )

	                # Overlay options
	                st.markdown("Overlay options:")
	                overlay_cols = st.columns(3)
	                with overlay_cols[0]:
	                    show_chunks = st.checkbox("Show chunk boundaries", value=False)
	                with overlay_cols[1]:
	                    show_ocr = st.checkbox("Show OCR regions", value=False)
	                with overlay_cols[2]:
	                    show_layout = st.checkbox("Show layout regions", value=False)

	                if show_chunks or show_ocr or show_layout:
	                    st.info("Overlay visualization coming soon - requires image annotation support")
	        else:
	            st.info("No page images available for this document")
	            st.markdown("Page images are extracted from PDF documents during processing.")

	    # Navigation to other modules
	    st.markdown("---")
	    st.markdown("### 🔗 Actions")

	    nav_cols = st.columns(4)

	    with nav_cols[0]:
	        if st.button("💬 Ask Questions", use_container_width=True):
	            st.switch_page("pages/2_💬_Interactive_RAG.py")

	    with nav_cols[1]:
	        if st.button("🎯 View Evidence", use_container_width=True):
	            st.switch_page("pages/4_🎯_Evidence_Viewer.py")

	    with nav_cols[2]:
	        if st.button("📊 Compare Documents", use_container_width=True):
	            st.switch_page("pages/3_📊_Document_Comparison.py")

	    with nav_cols[3]:
	        if st.button("🔬 Process New", use_container_width=True):
	            st.switch_page("pages/1_🔬_Live_Processing.py")

	else:
	    # No active document
	    st.markdown("## No Document Selected")

	    col1, col2 = st.columns(2)

	    with col1:
	        st.markdown(
	            "### Getting Started\n"
	            "\n"
	            "1. Go to Live Processing to upload and process a document\n"
	            "2. Processed documents will appear in the sidebar\n"
	            "3. Click on a document to view its details\n"
	            "\n"
	            "Or select a document from the sidebar if you've already processed some."
	        )

	        if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
	            st.switch_page("pages/1_🔬_Live_Processing.py")

	    with col2:
	        # Show RAG stats
	        stats = get_store_stats()
	        st.markdown("### RAG Index Status")
	        st.metric("Total Indexed Chunks", stats.get("total_chunks", 0))

	        if rag_docs:
	            st.markdown("Indexed Documents:")
	            for doc in rag_docs[:5]:
	                doc_id = doc.get("document_id", "unknown")
	                chunks = doc.get("chunk_count", 0)
	                st.caption(f"• {_preview(doc_id, 30)} ({chunks} chunks)")

	            if len(rag_docs) > 5:
	                st.caption(f"... and {len(rag_docs) - 5} more")