import os
import shutil
import tempfile
import traceback

import pandas as pd
import streamlit as st

from backend import SemanticAnalyzer
st.set_page_config(page_title="Semantic Document Analyzer", layout="wide")
st.markdown("""
""", unsafe_allow_html=True)
st.title("🧠 Semantic Document Analyzer")
st.markdown("""
Holistic Document Understanding
This AI system leverages Sentence-BERT and Cross-Encoders to perform deep semantic analysis across long documents.
It goes beyond simple keyword matching to understand context, detecting subtle contradictions and semantic duplicates.
""", unsafe_allow_html=True)
# Sidebar
with st.sidebar:
st.header("Upload Documents")
uploaded_files = st.file_uploader("Upload PDF files", type=['pdf'], accept_multiple_files=True)
analyze_btn = st.button("Analyze Documents", type="primary")
if analyze_btn and uploaded_files:
if len(uploaded_files) == 0:
st.error("Please upload at least one document.")
else:
with st.spinner("Processing documents... This may take a while for large files."):
# Save uploaded files temporarily
temp_dir = tempfile.mkdtemp()
file_paths = []
for uploaded_file in uploaded_files:
path = os.path.join(temp_dir, uploaded_file.name)
with open(path, "wb") as f:
f.write(uploaded_file.getbuffer())
file_paths.append(path)
# Initialize Analyzer
try:
analyzer = SemanticAnalyzer()
results = analyzer.analyze_documents(file_paths)
# Cleanup
# for path in file_paths: os.remove(path)
# os.rmdir(temp_dir)
if "error" in results:
st.error(results["error"])
else:
# Dashboard Layout
col1, col2 = st.columns(2)
with col1:
st.metric("Total Documents", results['stats']['total_docs'])
with col2:
st.metric("Total Text Chunks", results['stats']['total_chunks'])
st.divider()
# 1. Duplicates
st.subheader(f"⚠️ Potential Duplicates Detected ({len(results['duplicates'])})")
if results['duplicates']:
for dup in results['duplicates']:
with st.expander(f"Similarity Score: {dup['score']:.4f}"):
c1, c2 = st.columns(2)
with c1:
st.caption(f"Source: {dup['chunk_a']['source']}")
st.info(dup['chunk_a']['text'])
with c2:
st.caption(f"Source: {dup['chunk_b']['source']}")
st.info(dup['chunk_b']['text'])
else:
st.success("No duplicates found.")
st.divider()
# 2. Contradictions
st.subheader(f"🛑 Contradictions / Inconsistencies ({len(results['contradictions'])})")
if results['contradictions']:
for contra in results['contradictions']:
with st.expander(f"Contradiction Confidence: {contra['confidence']:.4f}"):
c1, c2 = st.columns(2)
with c1:
st.caption(f"Source: {contra['chunk_a']['source']}")
st.warning(contra['chunk_a']['text'])
with c2:
st.caption(f"Source: {contra['chunk_b']['source']}")
st.warning(contra['chunk_b']['text'])
# Export Report
report_text = f"# Semantic Analysis Report\n\n"
report_text += f"Total Documents: {results['stats']['total_docs']}\n"
report_text += f"Total Chunks: {results['stats']['total_chunks']}\n\n"
report_text += "## Duplicates\n"
if results['duplicates']:
for d in results['duplicates']:
report_text += f"- Score: {d['score']:.4f}\n"
report_text += f" - Source A: {d['chunk_a']['source']} | \"{d['chunk_a']['text'][:100]}...\"\n"
report_text += f" - Source B: {d['chunk_b']['source']} | \"{d['chunk_b']['text'][:100]}...\"\n\n"
else:
report_text += "No duplicates found.\n\n"
report_text += "## Contradictions\n"
if results['contradictions']:
for c in results['contradictions']:
report_text += f"- Confidence: {c['confidence']:.4f}\n"
report_text += f" - Source A: {c['chunk_a']['source']} | \"{c['chunk_a']['text']}\"\n"
report_text += f" - Source B: {c['chunk_b']['source']} | \"{c['chunk_b']['text']}\"\n\n"
else:
report_text += "No contradictions found.\n"
st.download_button(
label="Download Report (Markdown)",
data=report_text,
file_name="analysis_report.md",
mime="text/markdown"
)
except Exception as e:
st.error(f"An error occurred during analysis: {str(e)}")
import traceback
st.write(traceback.format_exc())
else:
st.info("Upload documents and click Analyze to start.")