| """ |
| Document Comparison - SPARKNET |
| |
| Compare documents using semantic similarity, structure analysis, |
| and content comparison with real embedding-based similarity. |
| """ |
|
|
import streamlit as st
import sys
from pathlib import Path
import pandas as pd

# Make the project root and its demo/ folder importable so the shared page
# helpers below (state_manager, rag_config, auth) resolve regardless of the
# directory Streamlit was launched from.
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))

from state_manager import (
    get_state_manager,
    render_global_status_bar,
)
# NOTE(review): search_similar_chunks and check_ollama are imported but not
# referenced anywhere on this page — confirm whether they can be dropped.
from rag_config import (
    get_indexed_documents,
    compute_document_similarity,
    search_similar_chunks,
    check_ollama,
    get_unified_rag_system,
)

# st.set_page_config must be the first Streamlit command executed on a page.
st.set_page_config(page_title="Document Comparison - SPARKNET", page_icon="π", layout="wide")

# Auth gate: stop rendering the page entirely until the password check passes.
from auth import check_password, show_logout_button
if not check_password():
    st.stop()
show_logout_button()
|
|
| |
# Page-scoped CSS injected once per rerun: comparison cards, the three-tier
# similarity badge (high/med/low), diff-highlighted chunk-match boxes, and
# small metric cards. Class names are referenced by the HTML fragments below.
st.markdown("""
<style>
.comparison-card {
    background: #161b22;
    border-radius: 10px;
    padding: 15px;
    margin: 10px 0;
    border: 1px solid #30363d;
}
.doc-header {
    font-size: 16px;
    font-weight: bold;
    color: #4ECDC4;
    margin-bottom: 10px;
}
.similarity-badge {
    display: inline-block;
    padding: 8px 16px;
    border-radius: 20px;
    font-weight: bold;
    font-size: 18px;
}
.sim-high {
    background: linear-gradient(90deg, #4ECDC4 0%, #44a08d 100%);
    color: white;
}
.sim-med {
    background: linear-gradient(90deg, #ffc107 0%, #ff8800 100%);
    color: black;
}
.sim-low {
    background: linear-gradient(90deg, #dc3545 0%, #c82333 100%);
    color: white;
}
.chunk-match {
    background: #0d1117;
    border-radius: 8px;
    padding: 10px;
    margin: 8px 0;
    border-left: 4px solid;
}
.diff-added {
    background: rgba(78, 205, 196, 0.1);
    border-left-color: #4ECDC4;
}
.diff-removed {
    background: rgba(220, 53, 69, 0.1);
    border-left-color: #dc3545;
}
.diff-common {
    background: rgba(139, 148, 158, 0.1);
    border-left-color: #8b949e;
}
.metric-card {
    background: #161b22;
    border-radius: 8px;
    padding: 15px;
    text-align: center;
}
.metric-value {
    font-size: 32px;
    font-weight: bold;
}
.metric-label {
    font-size: 11px;
    color: #8b949e;
    text-transform: uppercase;
}
</style>
""", unsafe_allow_html=True)
|
|
|
|
def get_similarity_class(sim: float) -> str:
    """Return the badge CSS class ("sim-high"/"sim-med"/"sim-low") for a score.

    Scores >= 0.7 are high, >= 0.4 medium, anything lower is low.
    """
    for cutoff, css_class in ((0.7, "sim-high"), (0.4, "sim-med")):
        if sim >= cutoff:
            return css_class
    return "sim-low"
|
|
|
|
def get_similarity_color(sim: float) -> str:
    """Return the hex accent color for a similarity score.

    Teal for scores >= 0.7, amber for >= 0.4, red otherwise — matching the
    sim-high / sim-med / sim-low badge palette defined in the page CSS.
    """
    for cutoff, hex_color in ((0.7, "#4ECDC4"), (0.4, "#ffc107")):
        if sim >= cutoff:
            return hex_color
    return "#dc3545"
|
|
|
|
| |
# Shared singletons: cross-page document state and the unified RAG backend.
state_manager = get_state_manager()
rag_system = get_unified_rag_system()

# Page header.
st.markdown("# π Document Comparison")
st.markdown("Compare documents using semantic similarity, structure analysis, and content comparison")

# Shared status bar rendered on every page.
render_global_status_bar()

st.markdown("---")

# Candidate documents come from two sources: fully-processed docs held in the
# session state manager, and docs already indexed in the RAG store.
all_docs = state_manager.get_all_documents()
indexed_docs = get_indexed_documents()
|
|
if not all_docs and not indexed_docs:
    # Empty state: nothing processed or indexed yet — show onboarding help.
    st.warning("No documents available for comparison")
    st.markdown("""
    ### Getting Started

    To compare documents:
    1. Go to **Live Processing** to upload and process documents
    2. Process at least 2 documents
    3. Come back here to compare them

    Features:
    - **Semantic Similarity**: Compare documents using embedding-based similarity
    - **Structure Analysis**: Compare document structure (pages, chunks, regions)
    - **Content Comparison**: Find similar passages between documents
    """)

    if st.button("π¬ Go to Live Processing", type="primary", use_container_width=True):
        st.switch_page("pages/1_π¬_Live_Processing.py")

else:
    # Build the selector map: display label -> {id, source ("state"|"rag"), doc}.
    # State docs are added first; a RAG entry is added only when its id is not
    # already present (the membership list is rebuilt per iteration — O(n^2),
    # acceptable for the small collections this page handles).
    doc_options = {}
    for doc in all_docs:
        doc_options[f"{doc.filename} (State)"] = {"id": doc.doc_id, "source": "state", "doc": doc}
    for doc in indexed_docs:
        doc_id = doc.get("document_id", "unknown")
        if doc_id not in [d["id"] for d in doc_options.values()]:
            doc_options[f"{doc_id} (RAG)"] = {"id": doc_id, "source": "rag", "doc": doc}

    if len(doc_options) < 2:
        st.warning("Need at least 2 documents for comparison. Process more documents first.")
    else:
        # --- Document selection -------------------------------------------
        st.markdown("### Select Documents to Compare")

        col1, col2 = st.columns(2)
        with col1:
            doc1_name = st.selectbox("Document 1", list(doc_options.keys()), index=0)
        with col2:
            # Exclude the first pick so a document cannot be compared to itself.
            remaining = [k for k in doc_options.keys() if k != doc1_name]
            doc2_name = st.selectbox("Document 2", remaining, index=0 if remaining else None)

        doc1_info = doc_options.get(doc1_name)
        doc2_info = doc_options.get(doc2_name)

        comparison_type = st.radio(
            "Comparison Type",
            ["Semantic Similarity", "Structure Analysis", "Content Comparison"],
            horizontal=True,
        )

        if st.button("π Compare Documents", type="primary", use_container_width=True):
            st.markdown("---")

            if comparison_type == "Semantic Similarity":
                st.markdown("### Semantic Similarity Analysis")

                with st.spinner("Computing document embeddings and similarity..."):
                    # Prefer real embedding-based similarity from the RAG
                    # backend; on error fall back to Jaccard word overlap
                    # (state docs only), and finally to a 0.5 placeholder.
                    # NOTE(review): assumes rag_system is a dict exposing a
                    # "status" key — confirm against get_unified_rag_system().
                    if rag_system["status"] == "ready":
                        result = compute_document_similarity(doc1_info["id"], doc2_info["id"])

                        if result.get("error"):
                            st.warning(f"Could not compute similarity: {result['error']}")
                            if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                                doc1 = doc1_info["doc"]
                                doc2 = doc2_info["doc"]
                                # Jaccard similarity over lowercase word sets;
                                # max(..., 1) guards against both texts empty.
                                words1 = set(doc1.raw_text.lower().split())
                                words2 = set(doc2.raw_text.lower().split())
                                overlap = len(words1 & words2) / max(len(words1 | words2), 1)
                                similarity = overlap
                            else:
                                similarity = 0.5
                        else:
                            similarity = result.get("similarity", 0)
                    else:
                        st.error("RAG system not ready for similarity computation")
                        similarity = 0.5

                # Badge styling derived from the score.
                sim_class = get_similarity_class(similarity)
                sim_color = get_similarity_color(similarity)  # NOTE(review): unused in this branch

                st.markdown(f"""
                <div style="text-align: center; padding: 30px;">
                    <div class="similarity-badge {sim_class}">
                        {similarity:.0%} Similarity
                    </div>
                    <p style="color: #8b949e; margin-top: 15px;">
                        Based on embedding-based semantic similarity
                    </p>
                </div>
                """, unsafe_allow_html=True)

                # Plain-language interpretation, same 0.7/0.4 thresholds as the badge.
                if similarity >= 0.7:
                    st.success("These documents are highly similar in content and meaning.")
                elif similarity >= 0.4:
                    st.warning("These documents have moderate similarity - some shared topics.")
                else:
                    st.info("These documents are quite different in content.")

                # Side-by-side basic stats; RAG entries only expose chunk_count.
                col1, col2 = st.columns(2)

                with col1:
                    st.markdown(f"#### π {doc1_name.split(' (')[0]}")
                    if doc1_info["source"] == "state":
                        doc = doc1_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc1_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

                with col2:
                    st.markdown(f"#### π {doc2_name.split(' (')[0]}")
                    if doc2_info["source"] == "state":
                        doc = doc2_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc2_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

            elif comparison_type == "Structure Analysis":
                st.markdown("### Document Structure Comparison")

                col1, col2 = st.columns(2)

                def get_structure(info):
                    # Structural metrics for one selection; state docs expose
                    # rich attributes, RAG entries only chunk_count/source_path.
                    if info["source"] == "state":
                        doc = info["doc"]
                        return {
                            "Pages": doc.page_count,
                            "Chunks": len(doc.chunks),
                            "OCR Regions": len(doc.ocr_regions),
                            "Layout Regions": len(doc.layout_data.get("regions", [])),
                            "Characters": len(doc.raw_text),
                            "Words": len(doc.raw_text.split()),
                        }
                    else:
                        doc = info["doc"]
                        return {
                            "Chunks": doc.get("chunk_count", 0),
                            "Source": doc.get("source_path", "N/A"),
                        }

                struct1 = get_structure(doc1_info)
                struct2 = get_structure(doc2_info)

                with col1:
                    st.markdown(f"#### π {doc1_name.split(' (')[0]}")
                    for key, value in struct1.items():
                        # Thousands separator only for large integer metrics.
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                with col2:
                    st.markdown(f"#### π {doc2_name.split(' (')[0]}")
                    for key, value in struct2.items():
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                st.markdown("---")
                st.markdown("### Comparison Chart")

                # Chart only numeric metrics present on both documents.
                common_keys = [k for k in struct1.keys() if k in struct2 and isinstance(struct1[k], (int, float))]
                if common_keys:
                    comparison_df = pd.DataFrame({
                        "Metric": common_keys,
                        doc1_name.split(' (')[0]: [struct1[k] for k in common_keys],
                        doc2_name.split(' (')[0]: [struct2[k] for k in common_keys],
                    })
                    st.bar_chart(comparison_df.set_index("Metric"))

                # Chunk-type breakdown requires full state docs on both sides.
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    st.markdown("---")
                    st.markdown("### Chunk Type Distribution")

                    def get_chunk_types(doc):
                        # Histogram of chunks keyed by their chunk_type field.
                        types = {}
                        for chunk in doc.chunks:
                            t = chunk.get("chunk_type", "unknown")
                            types[t] = types.get(t, 0) + 1
                        return types

                    types1 = get_chunk_types(doc1_info["doc"])
                    types2 = get_chunk_types(doc2_info["doc"])

                    all_types = set(types1.keys()) | set(types2.keys())

                    type_df = pd.DataFrame({
                        "Type": list(all_types),
                        doc1_name.split(' (')[0]: [types1.get(t, 0) for t in all_types],
                        doc2_name.split(' (')[0]: [types2.get(t, 0) for t in all_types],
                    })
                    st.dataframe(type_df, width='stretch', hide_index=True)

            else:
                # Content Comparison branch.
                st.markdown("### Content Comparison")

                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    doc1 = doc1_info["doc"]
                    doc2 = doc2_info["doc"]

                    # Word-set overlap metrics over lowercase tokens.
                    words1 = set(doc1.raw_text.lower().split())
                    words2 = set(doc2.raw_text.lower().split())

                    common_words = words1 & words2
                    only_doc1 = words1 - words2
                    only_doc2 = words2 - words1

                    metric_cols = st.columns(4)
                    metric_cols[0].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #4ECDC4;">{len(common_words):,}</div>
                        <div class="metric-label">Common Words</div>
                    </div>
                    """, unsafe_allow_html=True)
                    metric_cols[1].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #FF6B6B;">{len(only_doc1):,}</div>
                        <div class="metric-label">Only in Doc 1</div>
                    </div>
                    """, unsafe_allow_html=True)
                    metric_cols[2].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #45B7D1;">{len(only_doc2):,}</div>
                        <div class="metric-label">Only in Doc 2</div>
                    </div>
                    """, unsafe_allow_html=True)

                    # Jaccard overlap percentage; max(..., 1) avoids div-by-zero.
                    overlap_pct = len(common_words) / max(len(words1 | words2), 1)
                    metric_cols[3].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #ffc107;">{overlap_pct:.0%}</div>
                        <div class="metric-label">Word Overlap</div>
                    </div>
                    """, unsafe_allow_html=True)

                    st.markdown("---")
                    st.markdown("### Similar Passages")

                    with st.spinner("Finding similar passages..."):
                        similar_passages = []

                        # For each of the first 10 doc1 chunks, find the single
                        # best doc2 chunk by Jaccard word similarity; keep only
                        # matches scoring above 0.3. Texts are truncated to 200
                        # chars for display.
                        for i, chunk1 in enumerate(doc1.chunks[:10]):
                            text1 = chunk1.get("text", "")
                            words_c1 = set(text1.lower().split())

                            best_match = None
                            best_score = 0

                            for j, chunk2 in enumerate(doc2.chunks):
                                text2 = chunk2.get("text", "")
                                words_c2 = set(text2.lower().split())

                                if words_c1 and words_c2:
                                    score = len(words_c1 & words_c2) / len(words_c1 | words_c2)
                                    if score > best_score and score > 0.3:
                                        best_score = score
                                        best_match = {
                                            "doc1_chunk": i,
                                            "doc2_chunk": j,
                                            "doc1_text": text1[:200],
                                            "doc2_text": text2[:200],
                                            "similarity": score,
                                        }

                            if best_match:
                                similar_passages.append(best_match)

                    if similar_passages:
                        # Show the top 5 matches, best first.
                        similar_passages.sort(key=lambda x: x["similarity"], reverse=True)

                        for i, match in enumerate(similar_passages[:5]):
                            sim_color = get_similarity_color(match["similarity"])  # NOTE(review): unused
                            with st.expander(f"Match {i+1} - Similarity: {match['similarity']:.0%}"):
                                col1, col2 = st.columns(2)
                                with col1:
                                    st.markdown(f"**{doc1_name.split(' (')[0]}** (Chunk {match['doc1_chunk']+1})")
                                    st.markdown(f"""
                                    <div class="chunk-match diff-common">
                                        {match['doc1_text']}...
                                    </div>
                                    """, unsafe_allow_html=True)
                                with col2:
                                    st.markdown(f"**{doc2_name.split(' (')[0]}** (Chunk {match['doc2_chunk']+1})")
                                    st.markdown(f"""
                                    <div class="chunk-match diff-common">
                                        {match['doc2_text']}...
                                    </div>
                                    """, unsafe_allow_html=True)
                    else:
                        st.info("No significantly similar passages found between documents")

                    st.markdown("---")
                    st.markdown("### Key Terms Comparison")

                    from collections import Counter

                    def get_top_words(text, n=20):
                        # Top-n frequent words, ignoring a small stopword list
                        # and any word of 3 characters or fewer.
                        words = text.lower().split()
                        stopwords = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
                                     "have", "has", "had", "do", "does", "did", "will", "would", "could",
                                     "should", "may", "might", "must", "and", "or", "but", "if", "then",
                                     "so", "to", "of", "in", "for", "on", "with", "at", "by", "from",
                                     "this", "that", "these", "those", "it", "its"}
                        words = [w for w in words if len(w) > 3 and w not in stopwords]
                        return Counter(words).most_common(n)

                    top1 = get_top_words(doc1.raw_text)
                    top2 = get_top_words(doc2.raw_text)

                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**Top terms in {doc1_name.split(' (')[0]}:**")
                        for word, count in top1[:10]:
                            # Teal when the term also ranks in the other doc's
                            # top list (list rebuilt per word — O(n^2), small n).
                            in_doc2 = word in [w for w, c in top2]
                            color = "#4ECDC4" if in_doc2 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>β’ {word}</span> ({count})", unsafe_allow_html=True)

                    with col2:
                        st.markdown(f"**Top terms in {doc2_name.split(' (')[0]}:**")
                        for word, count in top2[:10]:
                            in_doc1 = word in [w for w, c in top1]
                            color = "#4ECDC4" if in_doc1 else "#8b949e"
                            st.markdown(f"<span style='color: {color};'>β’ {word}</span> ({count})", unsafe_allow_html=True)

                else:
                    st.info("Content comparison requires both documents to be in processed state")

        # --- Export -------------------------------------------------------
        # Rendered whenever two documents are selectable, independent of the
        # Compare button (a click on Export triggers a rerun that would reset
        # the Compare button's state anyway).
        st.markdown("---")
        st.markdown("### Export Comparison")

        export_cols = st.columns(3)
        with export_cols[0]:
            if st.button("π Export as JSON", use_container_width=True):
                import json  # NOTE(review): imported but unused — st.json serializes itself
                export_data = {
                    "document1": doc1_name,
                    "document2": doc2_name,
                    "comparison_type": comparison_type,
                }
                st.json(export_data)
        with export_cols[1]:
            # CSV/PDF export are placeholders, intentionally disabled.
            st.button("π Export as CSV", disabled=True, use_container_width=True)
        with export_cols[2]:
            st.button("π Export as PDF", disabled=True, use_container_width=True)
|
| |
# Cross-page navigation shortcuts, rendered on every code path of this page.
st.markdown("---")
st.markdown("### Navigation")
nav_cols = st.columns(4)

with nav_cols[0]:
    if st.button("π¬ Live Processing", use_container_width=True):
        st.switch_page("pages/1_π¬_Live_Processing.py")
with nav_cols[1]:
    if st.button("π¬ Interactive RAG", use_container_width=True):
        st.switch_page("pages/2_π¬_Interactive_RAG.py")
with nav_cols[2]:
    if st.button("π― Evidence Viewer", use_container_width=True):
        st.switch_page("pages/4_π―_Evidence_Viewer.py")
with nav_cols[3]:
    if st.button("π Document Viewer", use_container_width=True):
        st.switch_page("pages/5_π_Document_Viewer.py")
|
|