# SPARKNET demo — pages/3_📊_Document_Comparison.py
# Provenance: author MHamdan, commit a05467c
# ("Add password authentication and full demo application")
"""
Document Comparison - SPARKNET
Compare documents using semantic similarity, structure analysis,
and content comparison with real embedding-based similarity.
"""
import streamlit as st
import sys
from pathlib import Path
import pandas as pd

# Make the repository root and the demo/ directory importable so the shared
# demo modules below (state_manager, rag_config, auth) resolve regardless of
# the working directory Streamlit was launched from.
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))

# Cross-page session-state helpers for the multi-page demo app.
from state_manager import (
    get_state_manager,
    render_global_status_bar,
)
# RAG backend helpers.
# NOTE(review): search_similar_chunks and check_ollama are imported but not
# used anywhere on this page — confirm before removing.
from rag_config import (
    get_indexed_documents,
    compute_document_similarity,
    search_similar_chunks,
    check_ollama,
    get_unified_rag_system,
)

# Must be the first Streamlit command executed on the page.
st.set_page_config(page_title="Document Comparison - SPARKNET", page_icon="📊", layout="wide")

# Authentication gate: stop rendering the page until the password check passes.
from auth import check_password, show_logout_button
if not check_password():
    st.stop()
show_logout_button()
# Custom CSS
# Page-scoped styles injected once: card containers, the similarity badge
# (high/med/low gradient variants), diff-style chunk boxes, and small metric
# cards. The class names defined here are referenced by the raw-HTML snippets
# rendered further down the page via st.markdown(..., unsafe_allow_html=True).
st.markdown("""
<style>
.comparison-card {
background: #161b22;
border-radius: 10px;
padding: 15px;
margin: 10px 0;
border: 1px solid #30363d;
}
.doc-header {
font-size: 16px;
font-weight: bold;
color: #4ECDC4;
margin-bottom: 10px;
}
.similarity-badge {
display: inline-block;
padding: 8px 16px;
border-radius: 20px;
font-weight: bold;
font-size: 18px;
}
.sim-high {
background: linear-gradient(90deg, #4ECDC4 0%, #44a08d 100%);
color: white;
}
.sim-med {
background: linear-gradient(90deg, #ffc107 0%, #ff8800 100%);
color: black;
}
.sim-low {
background: linear-gradient(90deg, #dc3545 0%, #c82333 100%);
color: white;
}
.chunk-match {
background: #0d1117;
border-radius: 8px;
padding: 10px;
margin: 8px 0;
border-left: 4px solid;
}
.diff-added {
background: rgba(78, 205, 196, 0.1);
border-left-color: #4ECDC4;
}
.diff-removed {
background: rgba(220, 53, 69, 0.1);
border-left-color: #dc3545;
}
.diff-common {
background: rgba(139, 148, 158, 0.1);
border-left-color: #8b949e;
}
.metric-card {
background: #161b22;
border-radius: 8px;
padding: 15px;
text-align: center;
}
.metric-value {
font-size: 32px;
font-weight: bold;
}
.metric-label {
font-size: 11px;
color: #8b949e;
text-transform: uppercase;
}
</style>
""", unsafe_allow_html=True)
def get_similarity_class(sim: float) -> str:
    """Map a similarity score in [0, 1] to the matching CSS badge class.

    Thresholds: >= 0.7 -> "sim-high", >= 0.4 -> "sim-med", else "sim-low".
    """
    # Walk the cutoffs from highest to lowest; first match wins.
    for cutoff, css_class in ((0.7, "sim-high"), (0.4, "sim-med")):
        if sim >= cutoff:
            return css_class
    return "sim-low"
def get_similarity_color(sim: float) -> str:
    """Map a similarity score in [0, 1] to a hex color.

    Teal (#4ECDC4) for >= 0.7, amber (#ffc107) for >= 0.4,
    red (#dc3545) otherwise — same cutoffs as get_similarity_class.
    """
    if sim < 0.4:
        return "#dc3545"
    return "#4ECDC4" if sim >= 0.7 else "#ffc107"
# Initialize state manager (cross-page document store) and the unified RAG
# backend handle (a dict exposing at least a "status" key — see usage below).
state_manager = get_state_manager()
rag_system = get_unified_rag_system()

# Header
st.markdown("# 📊 Document Comparison")
st.markdown("Compare documents using semantic similarity, structure analysis, and content comparison")

# Global status bar (shared component across demo pages)
render_global_status_bar()
st.markdown("---")

# Documents come from two sources:
#   - "state" docs: full in-session objects (raw_text, chunks, page_count, ...)
#   - "rag" docs: metadata dicts from the persistent index (chunk_count, ...)
all_docs = state_manager.get_all_documents()
indexed_docs = get_indexed_documents()

if not all_docs and not indexed_docs:
    # Empty state: nothing to compare yet — explain and link to processing.
    st.warning("No documents available for comparison")
    st.markdown("""
### Getting Started
To compare documents:
1. Go to **Live Processing** to upload and process documents
2. Process at least 2 documents
3. Come back here to compare them
Features:
- **Semantic Similarity**: Compare documents using embedding-based similarity
- **Structure Analysis**: Compare document structure (pages, chunks, regions)
- **Content Comparison**: Find similar passages between documents
""")
    if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
        st.switch_page("pages/1_🔬_Live_Processing.py")
else:
    # Build selectbox options: display label -> {id, source, doc}. State docs
    # are listed first; RAG-indexed docs are added only if their id is not
    # already present (deduplicates docs that exist in both stores).
    doc_options = {}
    for doc in all_docs:
        doc_options[f"{doc.filename} (State)"] = {"id": doc.doc_id, "source": "state", "doc": doc}
    for doc in indexed_docs:
        doc_id = doc.get("document_id", "unknown")
        # NOTE(review): rebuilds the id list on every iteration (O(n^2));
        # acceptable at demo scale.
        if doc_id not in [d["id"] for d in doc_options.values()]:
            doc_options[f"{doc_id} (RAG)"] = {"id": doc_id, "source": "rag", "doc": doc}

    if len(doc_options) < 2:
        st.warning("Need at least 2 documents for comparison. Process more documents first.")
    else:
        # Document selection — Document 2's choices exclude Document 1 so a
        # document is never compared against itself.
        st.markdown("### Select Documents to Compare")
        col1, col2 = st.columns(2)
        with col1:
            doc1_name = st.selectbox("Document 1", list(doc_options.keys()), index=0)
        with col2:
            remaining = [k for k in doc_options.keys() if k != doc1_name]
            doc2_name = st.selectbox("Document 2", remaining, index=0 if remaining else None)

        doc1_info = doc_options.get(doc1_name)
        doc2_info = doc_options.get(doc2_name)

        # Comparison type
        comparison_type = st.radio(
            "Comparison Type",
            ["Semantic Similarity", "Structure Analysis", "Content Comparison"],
            horizontal=True,
        )

        if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
            st.markdown("---")

            if comparison_type == "Semantic Similarity":
                st.markdown("### Semantic Similarity Analysis")
                with st.spinner("Computing document embeddings and similarity..."):
                    # Use the compute_document_similarity function from rag_config
                    if rag_system["status"] == "ready":
                        result = compute_document_similarity(doc1_info["id"], doc2_info["id"])
                        if result.get("error"):
                            st.warning(f"Could not compute similarity: {result['error']}")
                            # Use fallback based on text overlap — only possible
                            # when both documents carry raw text ("state" docs).
                            if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                                doc1 = doc1_info["doc"]
                                doc2 = doc2_info["doc"]
                                # Simple word overlap (Jaccard); max(..., 1)
                                # guards against division by zero on empty docs.
                                words1 = set(doc1.raw_text.lower().split())
                                words2 = set(doc2.raw_text.lower().split())
                                overlap = len(words1 & words2) / max(len(words1 | words2), 1)
                                similarity = overlap
                            else:
                                similarity = 0.5  # Default fallback
                        else:
                            similarity = result.get("similarity", 0)
                    else:
                        st.error("RAG system not ready for similarity computation")
                        similarity = 0.5

                # Display similarity score as a colored badge.
                sim_class = get_similarity_class(similarity)
                # NOTE(review): sim_color is computed but never used in this
                # branch — only the CSS class drives the badge color.
                sim_color = get_similarity_color(similarity)
                st.markdown(f"""
<div style="text-align: center; padding: 30px;">
<div class="similarity-badge {sim_class}">
{similarity:.0%} Similarity
</div>
<p style="color: #8b949e; margin-top: 15px;">
Based on embedding-based semantic similarity
</p>
</div>
""", unsafe_allow_html=True)

                # Similarity interpretation — thresholds mirror
                # get_similarity_class (0.7 / 0.4).
                if similarity >= 0.7:
                    st.success("These documents are highly similar in content and meaning.")
                elif similarity >= 0.4:
                    st.warning("These documents have moderate similarity - some shared topics.")
                else:
                    st.info("These documents are quite different in content.")

                # Document details; RAG-only docs expose less metadata.
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    if doc1_info["source"] == "state":
                        doc = doc1_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc1_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))
                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    if doc2_info["source"] == "state":
                        doc = doc2_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc2_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

            elif comparison_type == "Structure Analysis":
                st.markdown("### Document Structure Comparison")
                col1, col2 = st.columns(2)

                # Get structure data
                def get_structure(info):
                    # Flatten a document entry into label -> metric pairs;
                    # RAG-only entries expose just chunk count and source path.
                    if info["source"] == "state":
                        doc = info["doc"]
                        return {
                            "Pages": doc.page_count,
                            "Chunks": len(doc.chunks),
                            "OCR Regions": len(doc.ocr_regions),
                            "Layout Regions": len(doc.layout_data.get("regions", [])),
                            "Characters": len(doc.raw_text),
                            "Words": len(doc.raw_text.split()),
                        }
                    else:
                        doc = info["doc"]
                        return {
                            "Chunks": doc.get("chunk_count", 0),
                            "Source": doc.get("source_path", "N/A"),
                        }

                struct1 = get_structure(doc1_info)
                struct2 = get_structure(doc2_info)

                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    for key, value in struct1.items():
                        # Thousands separator only for large counts.
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)
                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    for key, value in struct2.items():
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                # Structure comparison chart over the numeric metrics both
                # documents have in common.
                st.markdown("---")
                st.markdown("### Comparison Chart")
                common_keys = [k for k in struct1.keys() if k in struct2 and isinstance(struct1[k], (int, float))]
                if common_keys:
                    comparison_df = pd.DataFrame({
                        "Metric": common_keys,
                        doc1_name.split(' (')[0]: [struct1[k] for k in common_keys],
                        doc2_name.split(' (')[0]: [struct2[k] for k in common_keys],
                    })
                    st.bar_chart(comparison_df.set_index("Metric"))

                # Chunk type comparison (if available)
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    st.markdown("---")
                    st.markdown("### Chunk Type Distribution")

                    def get_chunk_types(doc):
                        # Histogram of chunk_type over a state doc's chunks
                        # (chunks are dicts; missing type counts as "unknown").
                        types = {}
                        for chunk in doc.chunks:
                            t = chunk.get("chunk_type", "unknown")
                            types[t] = types.get(t, 0) + 1
                        return types

                    types1 = get_chunk_types(doc1_info["doc"])
                    types2 = get_chunk_types(doc2_info["doc"])
                    all_types = set(types1.keys()) | set(types2.keys())
                    type_df = pd.DataFrame({
                        "Type": list(all_types),
                        doc1_name.split(' (')[0]: [types1.get(t, 0) for t in all_types],
                        doc2_name.split(' (')[0]: [types2.get(t, 0) for t in all_types],
                    })
                    st.dataframe(type_df, width='stretch', hide_index=True)

            else:  # Content Comparison
                st.markdown("### Content Comparison")
                # Needs raw text and chunks, so both docs must be "state" docs.
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    doc1 = doc1_info["doc"]
                    doc2 = doc2_info["doc"]

                    # Word overlap analysis on lowercased word sets.
                    words1 = set(doc1.raw_text.lower().split())
                    words2 = set(doc2.raw_text.lower().split())
                    common_words = words1 & words2
                    only_doc1 = words1 - words2
                    only_doc2 = words2 - words1

                    # Metrics row: common / unique-to-each / overlap %.
                    metric_cols = st.columns(4)
                    metric_cols[0].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #4ECDC4;">{len(common_words):,}</div>
<div class="metric-label">Common Words</div>
</div>
""", unsafe_allow_html=True)
                    metric_cols[1].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #FF6B6B;">{len(only_doc1):,}</div>
<div class="metric-label">Only in Doc 1</div>
</div>
""", unsafe_allow_html=True)
                    metric_cols[2].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #45B7D1;">{len(only_doc2):,}</div>
<div class="metric-label">Only in Doc 2</div>
</div>
""", unsafe_allow_html=True)
                    # Jaccard word overlap; max(..., 1) avoids div-by-zero.
                    overlap_pct = len(common_words) / max(len(words1 | words2), 1)
                    metric_cols[3].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #ffc107;">{overlap_pct:.0%}</div>
<div class="metric-label">Word Overlap</div>
</div>
""", unsafe_allow_html=True)

                    # Similar passages
                    st.markdown("---")
                    st.markdown("### Similar Passages")

                    # Greedy matching: for each of doc1's first 10 chunks, find
                    # doc2's chunk with the highest Jaccard similarity > 0.3.
                    with st.spinner("Finding similar passages..."):
                        similar_passages = []
                        # Compare first 10 chunks from doc1 against doc2
                        for i, chunk1 in enumerate(doc1.chunks[:10]):
                            text1 = chunk1.get("text", "")
                            words_c1 = set(text1.lower().split())
                            best_match = None
                            best_score = 0
                            for j, chunk2 in enumerate(doc2.chunks):
                                text2 = chunk2.get("text", "")
                                words_c2 = set(text2.lower().split())
                                # Jaccard similarity
                                if words_c1 and words_c2:
                                    score = len(words_c1 & words_c2) / len(words_c1 | words_c2)
                                    if score > best_score and score > 0.3:
                                        best_score = score
                                        best_match = {
                                            "doc1_chunk": i,
                                            "doc2_chunk": j,
                                            "doc1_text": text1[:200],  # 200-char preview
                                            "doc2_text": text2[:200],
                                            "similarity": score,
                                        }
                            if best_match:
                                similar_passages.append(best_match)

                    if similar_passages:
                        # Sort by similarity, strongest first; show top 5.
                        similar_passages.sort(key=lambda x: x["similarity"], reverse=True)
                        for i, match in enumerate(similar_passages[:5]):
                            # NOTE(review): sim_color is unused below — the
                            # chunk cards use the fixed diff-common style.
                            sim_color = get_similarity_color(match["similarity"])
                            with st.expander(f"Match {i+1} - Similarity: {match['similarity']:.0%}"):
                                col1, col2 = st.columns(2)
                                with col1:
                                    st.markdown(f"**{doc1_name.split(' (')[0]}** (Chunk {match['doc1_chunk']+1})")
                                    st.markdown(f"""
<div class="chunk-match diff-common">
{match['doc1_text']}...
</div>
""", unsafe_allow_html=True)
                                with col2:
                                    st.markdown(f"**{doc2_name.split(' (')[0]}** (Chunk {match['doc2_chunk']+1})")
                                    st.markdown(f"""
<div class="chunk-match diff-common">
{match['doc2_text']}...
</div>
""", unsafe_allow_html=True)
                    else:
                        st.info("No significantly similar passages found between documents")

                    # Key terms comparison
                    st.markdown("---")
                    st.markdown("### Key Terms Comparison")

                    # Get most frequent words (simple approach)
                    from collections import Counter

                    def get_top_words(text, n=20):
                        # Top-n most frequent words, skipping stopwords and
                        # any word of 3 characters or fewer.
                        words = text.lower().split()
                        # Filter out common words
                        stopwords = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
                                     "have", "has", "had", "do", "does", "did", "will", "would", "could",
                                     "should", "may", "might", "must", "and", "or", "but", "if", "then",
                                     "so", "to", "of", "in", "for", "on", "with", "at", "by", "from",
                                     "this", "that", "these", "those", "it", "its"}
                        words = [w for w in words if len(w) > 3 and w not in stopwords]
                        return Counter(words).most_common(n)

                    top1 = get_top_words(doc1.raw_text)
                    top2 = get_top_words(doc2.raw_text)

                    # Terms also present in the other document's top list
                    # render teal; unique terms render grey.
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**Top terms in {doc1_name.split(' (')[0]}:**")
                        for word, count in top1[:10]:
                            in_doc2 = word in [w for w, c in top2]
                            color = "#4ECDC4" if in_doc2 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
                    with col2:
                        st.markdown(f"**Top terms in {doc2_name.split(' (')[0]}:**")
                        for word, count in top2[:10]:
                            in_doc1 = word in [w for w, c in top1]
                            color = "#4ECDC4" if in_doc1 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
                else:
                    st.info("Content comparison requires both documents to be in processed state")

            # Export options
            # NOTE(review): these widgets render inside the Compare button's
            # click-run; clicking Export triggers a rerun in which the Compare
            # click is gone, so the JSON likely never shows — consider caching
            # the comparison in st.session_state. Confirm placement/indent
            # against the original file.
            st.markdown("---")
            st.markdown("### Export Comparison")
            export_cols = st.columns(3)
            with export_cols[0]:
                if st.button("📄 Export as JSON", use_container_width=True):
                    import json
                    # Only the selection metadata is exported (no scores).
                    export_data = {
                        "document1": doc1_name,
                        "document2": doc2_name,
                        "comparison_type": comparison_type,
                    }
                    st.json(export_data)
            with export_cols[1]:
                st.button("📊 Export as CSV", disabled=True, use_container_width=True)
            with export_cols[2]:
                st.button("📋 Export as PDF", disabled=True, use_container_width=True)

# Navigation (always rendered, regardless of document availability)
st.markdown("---")
st.markdown("### Navigation")
nav_cols = st.columns(4)
with nav_cols[0]:
    if st.button("🔬 Live Processing", use_container_width=True):
        st.switch_page("pages/1_🔬_Live_Processing.py")
with nav_cols[1]:
    if st.button("💬 Interactive RAG", use_container_width=True):
        st.switch_page("pages/2_💬_Interactive_RAG.py")
with nav_cols[2]:
    if st.button("🎯 Evidence Viewer", use_container_width=True):
        st.switch_page("pages/4_🎯_Evidence_Viewer.py")
with nav_cols[3]:
    if st.button("📄 Document Viewer", use_container_width=True):
        st.switch_page("pages/5_📄_Document_Viewer.py")