import os
import shutil
import tempfile
import traceback

import pandas as pd
import streamlit as st

from backend import SemanticAnalyzer
st.set_page_config(page_title="Semantic Document Analyzer", layout="wide")
st.markdown("""
""", unsafe_allow_html=True)
st.title("🧠 Semantic Document Analyzer")
st.markdown("""
Holistic Document Understanding
This AI system leverages Sentence-BERT and Cross-Encoders to perform deep semantic analysis across long documents.
It goes beyond simple keyword matching to understand context, detecting subtle contradictions and semantic duplicates.
""", unsafe_allow_html=True)
# Sidebar
with st.sidebar:
st.header("Upload Documents")
uploaded_files = st.file_uploader("Upload PDF files", type=['pdf'], accept_multiple_files=True)
analyze_btn = st.button("Analyze Documents", type="primary")
if analyze_btn and uploaded_files:
if len(uploaded_files) == 0:
st.error("Please upload at least one document.")
else:
with st.spinner("Processing documents... This may take a while for large files."):
# Save uploaded files temporarily
temp_dir = tempfile.mkdtemp()
file_paths = []
for uploaded_file in uploaded_files:
path = os.path.join(temp_dir, uploaded_file.name)
with open(path, "wb") as f:
f.write(uploaded_file.getbuffer())
file_paths.append(path)
# Initialize Analyzer
try:
analyzer = SemanticAnalyzer()
results = analyzer.analyze_documents(file_paths)
# Cleanup
# for path in file_paths: os.remove(path)
# os.rmdir(temp_dir)
if "error" in results:
st.error(results["error"])
else:
# Dashboard Layout
col1, col2 = st.columns(2)
with col1:
st.metric("Total Documents", results['stats']['total_docs'])
with col2:
st.metric("Total Text Chunks", results['stats']['total_chunks'])
st.divider()
# 1. Duplicates
st.subheader(f"⚠️ Potential Duplicates Detected ({len(results['duplicates'])})")
if results['duplicates']:
for dup in results['duplicates']:
with st.expander(f"Similarity Score: {dup['score']:.4f}"):
c1, c2 = st.columns(2)
with c1:
st.caption(f"Source: {dup['chunk_a']['source']}")
st.info(dup['chunk_a']['text'])
with c2:
st.caption(f"Source: {dup['chunk_b']['source']}")
st.info(dup['chunk_b']['text'])
else:
st.success("No duplicates found.")
st.divider()
# 2. Contradictions
st.subheader(f"🛑 Contradictions / Inconsistencies ({len(results['contradictions'])})")
if results['contradictions']:
for contra in results['contradictions']:
with st.expander(f"Contradiction Confidence: {contra['confidence']:.4f}"):
c1, c2 = st.columns(2)
with c1:
st.caption(f"Source: {contra['chunk_a']['source']}")
st.warning(contra['chunk_a']['text'])
with c2:
st.caption(f"Source: {contra['chunk_b']['source']}")
st.warning(contra['chunk_b']['text'])
# Export Report
report_text = f"# Semantic Analysis Report\n\n"
report_text += f"Total Documents: {results['stats']['total_docs']}\n"
report_text += f"Total Chunks: {results['stats']['total_chunks']}\n\n"
report_text += "## Duplicates\n"
if results['duplicates']:
for d in results['duplicates']:
report_text += f"- Score: {d['score']:.4f}\n"
report_text += f" - Source A: {d['chunk_a']['source']} | \"{d['chunk_a']['text'][:100]}...\"\n"
report_text += f" - Source B: {d['chunk_b']['source']} | \"{d['chunk_b']['text'][:100]}...\"\n\n"
else:
report_text += "No duplicates found.\n\n"
report_text += "## Contradictions\n"
if results['contradictions']:
for c in results['contradictions']:
report_text += f"- Confidence: {c['confidence']:.4f}\n"
report_text += f" - Source A: {c['chunk_a']['source']} | \"{c['chunk_a']['text']}\"\n"
report_text += f" - Source B: {c['chunk_b']['source']} | \"{c['chunk_b']['text']}\"\n\n"
else:
report_text += "No contradictions found.\n"
st.download_button(
label="Download Report (Markdown)",
data=report_text,
file_name="analysis_report.md",
mime="text/markdown"
)
except Exception as e:
st.error(f"An error occurred during analysis: {str(e)}")
import traceback
st.write(traceback.format_exc())
else:
st.info("Upload documents and click Analyze to start.")