# SPARKNET demo — pages/3_📊_Document_Comparison.py
# Provenance: author MHamdan, commit a05467c
# ("Add password authentication and full demo application")
"""
Document Comparison - SPARKNET
Compare documents using semantic similarity, structure analysis,
and content comparison with real embedding-based similarity.
"""
import streamlit as st
import sys
from pathlib import Path
import pandas as pd

# Make the repository root and the demo/ directory importable so the shared
# demo modules below (state_manager, rag_config, auth) resolve regardless of
# the working directory Streamlit was launched from.
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))

# Cross-page session-state helpers for the multi-page demo app.
from state_manager import (
    get_state_manager,
    render_global_status_bar,
)
# RAG backend helpers.
# NOTE(review): search_similar_chunks and check_ollama are imported but not
# used anywhere on this page — confirm before removing.
from rag_config import (
    get_indexed_documents,
    compute_document_similarity,
    search_similar_chunks,
    check_ollama,
    get_unified_rag_system,
)

# Must be the first Streamlit command executed on the page.
st.set_page_config(page_title="Document Comparison - SPARKNET", page_icon="📊", layout="wide")

# Authentication gate: stop rendering the page until the password check passes.
from auth import check_password, show_logout_button
if not check_password():
    st.stop()
show_logout_button()
# Custom CSS
# Page-scoped styles injected once: card containers, the similarity badge
# (high/med/low gradient variants), diff-style chunk boxes, and small metric
# cards. The class names defined here are referenced by the raw-HTML snippets
# rendered further down the page via st.markdown(..., unsafe_allow_html=True).
st.markdown("""
<style>
.comparison-card {
background: #161b22;
border-radius: 10px;
padding: 15px;
margin: 10px 0;
border: 1px solid #30363d;
}
.doc-header {
font-size: 16px;
font-weight: bold;
color: #4ECDC4;
margin-bottom: 10px;
}
.similarity-badge {
display: inline-block;
padding: 8px 16px;
border-radius: 20px;
font-weight: bold;
font-size: 18px;
}
.sim-high {
background: linear-gradient(90deg, #4ECDC4 0%, #44a08d 100%);
color: white;
}
.sim-med {
background: linear-gradient(90deg, #ffc107 0%, #ff8800 100%);
color: black;
}
.sim-low {
background: linear-gradient(90deg, #dc3545 0%, #c82333 100%);
color: white;
}
.chunk-match {
background: #0d1117;
border-radius: 8px;
padding: 10px;
margin: 8px 0;
border-left: 4px solid;
}
.diff-added {
background: rgba(78, 205, 196, 0.1);
border-left-color: #4ECDC4;
}
.diff-removed {
background: rgba(220, 53, 69, 0.1);
border-left-color: #dc3545;
}
.diff-common {
background: rgba(139, 148, 158, 0.1);
border-left-color: #8b949e;
}
.metric-card {
background: #161b22;
border-radius: 8px;
padding: 15px;
text-align: center;
}
.metric-value {
font-size: 32px;
font-weight: bold;
}
.metric-label {
font-size: 11px;
color: #8b949e;
text-transform: uppercase;
}
</style>
""", unsafe_allow_html=True)
def get_similarity_class(sim: float) -> str:
    """Map a similarity score in [0, 1] to the matching CSS badge class.

    Thresholds: >= 0.7 -> "sim-high", >= 0.4 -> "sim-med", else "sim-low".
    """
    # Walk the cutoffs from highest to lowest; first match wins.
    for cutoff, css_class in ((0.7, "sim-high"), (0.4, "sim-med")):
        if sim >= cutoff:
            return css_class
    return "sim-low"
def get_similarity_color(sim: float) -> str:
    """Map a similarity score in [0, 1] to a hex color.

    Teal (#4ECDC4) for >= 0.7, amber (#ffc107) for >= 0.4,
    red (#dc3545) otherwise — same cutoffs as get_similarity_class.
    """
    if sim < 0.4:
        return "#dc3545"
    return "#4ECDC4" if sim >= 0.7 else "#ffc107"
# Initialize state manager (cross-page document store) and the unified RAG
# backend handle (a dict exposing at least a "status" key — see usage below).
state_manager = get_state_manager()
rag_system = get_unified_rag_system()

# Header
st.markdown("# 📊 Document Comparison")
st.markdown("Compare documents using semantic similarity, structure analysis, and content comparison")

# Global status bar (shared component across demo pages)
render_global_status_bar()
st.markdown("---")

# Documents come from two sources:
#   - "state" docs: full in-session objects (raw_text, chunks, page_count, ...)
#   - "rag" docs: metadata dicts from the persistent index (chunk_count, ...)
all_docs = state_manager.get_all_documents()
indexed_docs = get_indexed_documents()

if not all_docs and not indexed_docs:
    # Empty state: nothing to compare yet — explain and link to processing.
    st.warning("No documents available for comparison")
    st.markdown("""
### Getting Started
To compare documents:
1. Go to **Live Processing** to upload and process documents
2. Process at least 2 documents
3. Come back here to compare them
Features:
- **Semantic Similarity**: Compare documents using embedding-based similarity
- **Structure Analysis**: Compare document structure (pages, chunks, regions)
- **Content Comparison**: Find similar passages between documents
""")
    if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
        st.switch_page("pages/1_🔬_Live_Processing.py")
else:
    # Build selectbox options: display label -> {id, source, doc}. State docs
    # are listed first; RAG-indexed docs are added only if their id is not
    # already present (deduplicates docs that exist in both stores).
    doc_options = {}
    for doc in all_docs:
        doc_options[f"{doc.filename} (State)"] = {"id": doc.doc_id, "source": "state", "doc": doc}
    for doc in indexed_docs:
        doc_id = doc.get("document_id", "unknown")
        # NOTE(review): rebuilds the id list on every iteration (O(n^2));
        # acceptable at demo scale.
        if doc_id not in [d["id"] for d in doc_options.values()]:
            doc_options[f"{doc_id} (RAG)"] = {"id": doc_id, "source": "rag", "doc": doc}

    if len(doc_options) < 2:
        st.warning("Need at least 2 documents for comparison. Process more documents first.")
    else:
        # Document selection — Document 2's choices exclude Document 1 so a
        # document is never compared against itself.
        st.markdown("### Select Documents to Compare")
        col1, col2 = st.columns(2)
        with col1:
            doc1_name = st.selectbox("Document 1", list(doc_options.keys()), index=0)
        with col2:
            remaining = [k for k in doc_options.keys() if k != doc1_name]
            doc2_name = st.selectbox("Document 2", remaining, index=0 if remaining else None)

        doc1_info = doc_options.get(doc1_name)
        doc2_info = doc_options.get(doc2_name)

        # Comparison type
        comparison_type = st.radio(
            "Comparison Type",
            ["Semantic Similarity", "Structure Analysis", "Content Comparison"],
            horizontal=True,
        )

        if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
            st.markdown("---")

            if comparison_type == "Semantic Similarity":
                st.markdown("### Semantic Similarity Analysis")
                with st.spinner("Computing document embeddings and similarity..."):
                    # Use the compute_document_similarity function from rag_config
                    if rag_system["status"] == "ready":
                        result = compute_document_similarity(doc1_info["id"], doc2_info["id"])
                        if result.get("error"):
                            st.warning(f"Could not compute similarity: {result['error']}")
                            # Use fallback based on text overlap — only possible
                            # when both documents carry raw text ("state" docs).
                            if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                                doc1 = doc1_info["doc"]
                                doc2 = doc2_info["doc"]
                                # Simple word overlap (Jaccard); max(..., 1)
                                # guards against division by zero on empty docs.
                                words1 = set(doc1.raw_text.lower().split())
                                words2 = set(doc2.raw_text.lower().split())
                                overlap = len(words1 & words2) / max(len(words1 | words2), 1)
                                similarity = overlap
                            else:
                                similarity = 0.5  # Default fallback
                        else:
                            similarity = result.get("similarity", 0)
                    else:
                        st.error("RAG system not ready for similarity computation")
                        similarity = 0.5

                # Display similarity score as a colored badge.
                sim_class = get_similarity_class(similarity)
                # NOTE(review): sim_color is computed but never used in this
                # branch — only the CSS class drives the badge color.
                sim_color = get_similarity_color(similarity)
                st.markdown(f"""
<div style="text-align: center; padding: 30px;">
<div class="similarity-badge {sim_class}">
{similarity:.0%} Similarity
</div>
<p style="color: #8b949e; margin-top: 15px;">
Based on embedding-based semantic similarity
</p>
</div>
""", unsafe_allow_html=True)

                # Similarity interpretation — thresholds mirror
                # get_similarity_class (0.7 / 0.4).
                if similarity >= 0.7:
                    st.success("These documents are highly similar in content and meaning.")
                elif similarity >= 0.4:
                    st.warning("These documents have moderate similarity - some shared topics.")
                else:
                    st.info("These documents are quite different in content.")

                # Document details; RAG-only docs expose less metadata.
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    if doc1_info["source"] == "state":
                        doc = doc1_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc1_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))
                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    if doc2_info["source"] == "state":
                        doc = doc2_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc2_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

            elif comparison_type == "Structure Analysis":
                st.markdown("### Document Structure Comparison")
                col1, col2 = st.columns(2)

                # Get structure data
                def get_structure(info):
                    # Flatten a document entry into label -> metric pairs;
                    # RAG-only entries expose just chunk count and source path.
                    if info["source"] == "state":
                        doc = info["doc"]
                        return {
                            "Pages": doc.page_count,
                            "Chunks": len(doc.chunks),
                            "OCR Regions": len(doc.ocr_regions),
                            "Layout Regions": len(doc.layout_data.get("regions", [])),
                            "Characters": len(doc.raw_text),
                            "Words": len(doc.raw_text.split()),
                        }
                    else:
                        doc = info["doc"]
                        return {
                            "Chunks": doc.get("chunk_count", 0),
                            "Source": doc.get("source_path", "N/A"),
                        }

                struct1 = get_structure(doc1_info)
                struct2 = get_structure(doc2_info)

                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    for key, value in struct1.items():
                        # Thousands separator only for large counts.
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)
                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    for key, value in struct2.items():
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                # Structure comparison chart over the numeric metrics both
                # documents have in common.
                st.markdown("---")
                st.markdown("### Comparison Chart")
                common_keys = [k for k in struct1.keys() if k in struct2 and isinstance(struct1[k], (int, float))]
                if common_keys:
                    comparison_df = pd.DataFrame({
                        "Metric": common_keys,
                        doc1_name.split(' (')[0]: [struct1[k] for k in common_keys],
                        doc2_name.split(' (')[0]: [struct2[k] for k in common_keys],
                    })
                    st.bar_chart(comparison_df.set_index("Metric"))

                # Chunk type comparison (if available)
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    st.markdown("---")
                    st.markdown("### Chunk Type Distribution")

                    def get_chunk_types(doc):
                        # Histogram of chunk_type over a state doc's chunks
                        # (chunks are dicts; missing type counts as "unknown").
                        types = {}
                        for chunk in doc.chunks:
                            t = chunk.get("chunk_type", "unknown")
                            types[t] = types.get(t, 0) + 1
                        return types

                    types1 = get_chunk_types(doc1_info["doc"])
                    types2 = get_chunk_types(doc2_info["doc"])
                    all_types = set(types1.keys()) | set(types2.keys())
                    type_df = pd.DataFrame({
                        "Type": list(all_types),
                        doc1_name.split(' (')[0]: [types1.get(t, 0) for t in all_types],
                        doc2_name.split(' (')[0]: [types2.get(t, 0) for t in all_types],
                    })
                    st.dataframe(type_df, width='stretch', hide_index=True)

            else:  # Content Comparison
                st.markdown("### Content Comparison")
                # Needs raw text and chunks, so both docs must be "state" docs.
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    doc1 = doc1_info["doc"]
                    doc2 = doc2_info["doc"]

                    # Word overlap analysis on lowercased word sets.
                    words1 = set(doc1.raw_text.lower().split())
                    words2 = set(doc2.raw_text.lower().split())
                    common_words = words1 & words2
                    only_doc1 = words1 - words2
                    only_doc2 = words2 - words1

                    # Metrics row: common / unique-to-each / overlap %.
                    metric_cols = st.columns(4)
                    metric_cols[0].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #4ECDC4;">{len(common_words):,}</div>
<div class="metric-label">Common Words</div>
</div>
""", unsafe_allow_html=True)
                    metric_cols[1].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #FF6B6B;">{len(only_doc1):,}</div>
<div class="metric-label">Only in Doc 1</div>
</div>
""", unsafe_allow_html=True)
                    metric_cols[2].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #45B7D1;">{len(only_doc2):,}</div>
<div class="metric-label">Only in Doc 2</div>
</div>
""", unsafe_allow_html=True)
                    # Jaccard word overlap; max(..., 1) avoids div-by-zero.
                    overlap_pct = len(common_words) / max(len(words1 | words2), 1)
                    metric_cols[3].markdown(f"""
<div class="metric-card">
<div class="metric-value" style="color: #ffc107;">{overlap_pct:.0%}</div>
<div class="metric-label">Word Overlap</div>
</div>
""", unsafe_allow_html=True)

                    # Similar passages
                    st.markdown("---")
                    st.markdown("### Similar Passages")

                    # Greedy matching: for each of doc1's first 10 chunks, find
                    # doc2's chunk with the highest Jaccard similarity > 0.3.
                    with st.spinner("Finding similar passages..."):
                        similar_passages = []
                        # Compare first 10 chunks from doc1 against doc2
                        for i, chunk1 in enumerate(doc1.chunks[:10]):
                            text1 = chunk1.get("text", "")
                            words_c1 = set(text1.lower().split())
                            best_match = None
                            best_score = 0
                            for j, chunk2 in enumerate(doc2.chunks):
                                text2 = chunk2.get("text", "")
                                words_c2 = set(text2.lower().split())
                                # Jaccard similarity
                                if words_c1 and words_c2:
                                    score = len(words_c1 & words_c2) / len(words_c1 | words_c2)
                                    if score > best_score and score > 0.3:
                                        best_score = score
                                        best_match = {
                                            "doc1_chunk": i,
                                            "doc2_chunk": j,
                                            "doc1_text": text1[:200],  # 200-char preview
                                            "doc2_text": text2[:200],
                                            "similarity": score,
                                        }
                            if best_match:
                                similar_passages.append(best_match)

                    if similar_passages:
                        # Sort by similarity, strongest first; show top 5.
                        similar_passages.sort(key=lambda x: x["similarity"], reverse=True)
                        for i, match in enumerate(similar_passages[:5]):
                            # NOTE(review): sim_color is unused below — the
                            # chunk cards use the fixed diff-common style.
                            sim_color = get_similarity_color(match["similarity"])
                            with st.expander(f"Match {i+1} - Similarity: {match['similarity']:.0%}"):
                                col1, col2 = st.columns(2)
                                with col1:
                                    st.markdown(f"**{doc1_name.split(' (')[0]}** (Chunk {match['doc1_chunk']+1})")
                                    st.markdown(f"""
<div class="chunk-match diff-common">
{match['doc1_text']}...
</div>
""", unsafe_allow_html=True)
                                with col2:
                                    st.markdown(f"**{doc2_name.split(' (')[0]}** (Chunk {match['doc2_chunk']+1})")
                                    st.markdown(f"""
<div class="chunk-match diff-common">
{match['doc2_text']}...
</div>
""", unsafe_allow_html=True)
                    else:
                        st.info("No significantly similar passages found between documents")

                    # Key terms comparison
                    st.markdown("---")
                    st.markdown("### Key Terms Comparison")

                    # Get most frequent words (simple approach)
                    from collections import Counter

                    def get_top_words(text, n=20):
                        # Top-n most frequent words, skipping stopwords and
                        # any word of 3 characters or fewer.
                        words = text.lower().split()
                        # Filter out common words
                        stopwords = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
                                     "have", "has", "had", "do", "does", "did", "will", "would", "could",
                                     "should", "may", "might", "must", "and", "or", "but", "if", "then",
                                     "so", "to", "of", "in", "for", "on", "with", "at", "by", "from",
                                     "this", "that", "these", "those", "it", "its"}
                        words = [w for w in words if len(w) > 3 and w not in stopwords]
                        return Counter(words).most_common(n)

                    top1 = get_top_words(doc1.raw_text)
                    top2 = get_top_words(doc2.raw_text)

                    # Terms also present in the other document's top list
                    # render teal; unique terms render grey.
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**Top terms in {doc1_name.split(' (')[0]}:**")
                        for word, count in top1[:10]:
                            in_doc2 = word in [w for w, c in top2]
                            color = "#4ECDC4" if in_doc2 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
                    with col2:
                        st.markdown(f"**Top terms in {doc2_name.split(' (')[0]}:**")
                        for word, count in top2[:10]:
                            in_doc1 = word in [w for w, c in top1]
                            color = "#4ECDC4" if in_doc1 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
                else:
                    st.info("Content comparison requires both documents to be in processed state")

            # Export options
            # NOTE(review): these widgets render inside the Compare button's
            # click-run; clicking Export triggers a rerun in which the Compare
            # click is gone, so the JSON likely never shows — consider caching
            # the comparison in st.session_state. Confirm placement/indent
            # against the original file.
            st.markdown("---")
            st.markdown("### Export Comparison")
            export_cols = st.columns(3)
            with export_cols[0]:
                if st.button("📄 Export as JSON", use_container_width=True):
                    import json
                    # Only the selection metadata is exported (no scores).
                    export_data = {
                        "document1": doc1_name,
                        "document2": doc2_name,
                        "comparison_type": comparison_type,
                    }
                    st.json(export_data)
            with export_cols[1]:
                st.button("📊 Export as CSV", disabled=True, use_container_width=True)
            with export_cols[2]:
                st.button("📋 Export as PDF", disabled=True, use_container_width=True)

# Navigation (always rendered, regardless of document availability)
st.markdown("---")
st.markdown("### Navigation")
nav_cols = st.columns(4)
with nav_cols[0]:
    if st.button("🔬 Live Processing", use_container_width=True):
        st.switch_page("pages/1_🔬_Live_Processing.py")
with nav_cols[1]:
    if st.button("💬 Interactive RAG", use_container_width=True):
        st.switch_page("pages/2_💬_Interactive_RAG.py")
with nav_cols[2]:
    if st.button("🎯 Evidence Viewer", use_container_width=True):
        st.switch_page("pages/4_🎯_Evidence_Viewer.py")
with nav_cols[3]:
    if st.button("📄 Document Viewer", use_container_width=True):
        st.switch_page("pages/5_📄_Document_Viewer.py")