import streamlit as st
import pandas as pd
import os
import tempfile
from backend import SemanticAnalyzer
st.set_page_config(page_title="Semantic Document Analyzer", layout="wide")
st.markdown("""
<style>
/* Premium Look & Feel */
.stApp {
background: linear-gradient(to right, #f8f9fa, #e9ecef);
font-family: 'Inter', sans-serif;
}
.stButton>button {
background: linear-gradient(45deg, #4f46e5, #7c3aed);
color: white;
border: none;
border-radius: 8px;
padding: 0.75rem 1.5rem;
font-weight: 600;
transition: all 0.3s ease;
}
.stButton>button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 12px rgba(79, 70, 229, 0.3);
}
div[data-testid="stMetricValue"] {
color: #111827;
font-weight: 700;
}
h1 {
background: -webkit-linear-gradient(45deg, #1e3a8a, #3b82f6);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-weight: 800 !important;
}
.css-1d391kg {
background-color: #ffffff;
border-radius: 12px;
padding: 1.5rem;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
}
</style>
""", unsafe_allow_html=True)
# Page headline (styled by the h1 gradient CSS injected above).
st.title("🧠 Semantic Document Analyzer")

# Intro card: raw HTML rendered via unsafe_allow_html describing what the
# app does (semantic duplicate + contradiction detection over PDFs).
st.markdown("""
<div style='background-color: white; padding: 1.5rem; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); margin-bottom: 2rem;'>
<h4 style='margin-top:0'>Holistic Document Understanding</h4>
<p style='color: #4b5563;'>
This AI system leverages <b>Sentence-BERT</b> and <b>Cross-Encoders</b> to perform deep semantic analysis across long documents.
It goes beyond simple keyword matching to understand context, detecting subtle contradictions and semantic duplicates.
</p>
</div>
""", unsafe_allow_html=True)
# Sidebar: input controls. Both widgets rerun the script on interaction;
# `uploaded_files` (list of UploadedFile, possibly empty/None) and
# `analyze_btn` (True only on the rerun triggered by the click) drive the
# main flow below.
with st.sidebar:
    st.header("Upload Documents")
    uploaded_files = st.file_uploader("Upload PDF files", type=['pdf'], accept_multiple_files=True)
    analyze_btn = st.button("Analyze Documents", type="primary")
def _build_report(results):
    """Build the downloadable Markdown report from the analysis results.

    `results` is the dict produced by SemanticAnalyzer.analyze_documents:
    'stats' (with 'total_docs'/'total_chunks'), plus 'duplicates' and
    'contradictions' lists of chunk-pair records, as consumed below.
    """
    report_text = "# Semantic Analysis Report\n\n"
    report_text += f"Total Documents: {results['stats']['total_docs']}\n"
    report_text += f"Total Chunks: {results['stats']['total_chunks']}\n\n"
    report_text += "## Duplicates\n"
    if results['duplicates']:
        for d in results['duplicates']:
            report_text += f"- Score: {d['score']:.4f}\n"
            # Duplicate snippets are truncated to 100 chars to keep the report compact.
            report_text += f"  - Source A: {d['chunk_a']['source']} | \"{d['chunk_a']['text'][:100]}...\"\n"
            report_text += f"  - Source B: {d['chunk_b']['source']} | \"{d['chunk_b']['text'][:100]}...\"\n\n"
    else:
        report_text += "No duplicates found.\n\n"
    report_text += "## Contradictions\n"
    if results['contradictions']:
        for c in results['contradictions']:
            report_text += f"- Confidence: {c['confidence']:.4f}\n"
            report_text += f"  - Source A: {c['chunk_a']['source']} | \"{c['chunk_a']['text']}\"\n"
            report_text += f"  - Source B: {c['chunk_b']['source']} | \"{c['chunk_b']['text']}\"\n\n"
    else:
        report_text += "No contradictions found.\n"
    return report_text


# Main flow: runs only on the rerun triggered by the Analyze button.
if analyze_btn and not uploaded_files:
    # BUGFIX: in the original, this check lived inside
    # `if analyze_btn and uploaded_files:` as `if len(uploaded_files) == 0:`
    # and was therefore unreachable — clicking Analyze with no files silently
    # fell through to the idle hint instead of showing this error.
    st.error("Please upload at least one document.")
elif analyze_btn:
    with st.spinner("Processing documents... This may take a while for large files."):
        # Persist the in-memory uploads to a temp directory so the backend
        # can consume ordinary filesystem paths.
        temp_dir = tempfile.mkdtemp()
        file_paths = []
        for uploaded_file in uploaded_files:
            path = os.path.join(temp_dir, uploaded_file.name)
            with open(path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(path)
        try:
            analyzer = SemanticAnalyzer()
            results = analyzer.analyze_documents(file_paths)
            if "error" in results:
                st.error(results["error"])
            else:
                # --- Dashboard: headline stats ---
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Total Documents", results['stats']['total_docs'])
                with col2:
                    st.metric("Total Text Chunks", results['stats']['total_chunks'])
                st.divider()

                # 1. Semantic duplicates, shown as side-by-side chunk pairs.
                st.subheader(f"⚠️ Potential Duplicates Detected ({len(results['duplicates'])})")
                if results['duplicates']:
                    for dup in results['duplicates']:
                        with st.expander(f"Similarity Score: {dup['score']:.4f}"):
                            c1, c2 = st.columns(2)
                            with c1:
                                st.caption(f"Source: {dup['chunk_a']['source']}")
                                st.info(dup['chunk_a']['text'])
                            with c2:
                                st.caption(f"Source: {dup['chunk_b']['source']}")
                                st.info(dup['chunk_b']['text'])
                else:
                    st.success("No duplicates found.")
                st.divider()

                # 2. Contradictions / inconsistencies between chunks.
                st.subheader(f"🛑 Contradictions / Inconsistencies ({len(results['contradictions'])})")
                if results['contradictions']:
                    for contra in results['contradictions']:
                        with st.expander(f"Contradiction Confidence: {contra['confidence']:.4f}"):
                            c1, c2 = st.columns(2)
                            with c1:
                                st.caption(f"Source: {contra['chunk_a']['source']}")
                                st.warning(contra['chunk_a']['text'])
                            with c2:
                                st.caption(f"Source: {contra['chunk_b']['source']}")
                                st.warning(contra['chunk_b']['text'])
                else:
                    # Mirrors the duplicates section; the original rendered
                    # nothing here, leaving an apparently empty section.
                    st.success("No contradictions found.")

                # Export report (Markdown download).
                st.download_button(
                    label="Download Report (Markdown)",
                    data=_build_report(results),
                    file_name="analysis_report.md",
                    mime="text/markdown"
                )
        except Exception as e:
            # Top-level UI boundary: surface the failure with its traceback.
            st.error(f"An error occurred during analysis: {str(e)}")
            import traceback
            st.write(traceback.format_exc())
        finally:
            # FIX: cleanup was commented out in the original, leaking one
            # temp directory per analysis run. Best-effort: never let a
            # cleanup failure mask an analysis error or break the UI.
            for path in file_paths:
                try:
                    os.remove(path)
                except OSError:
                    pass
            try:
                os.rmdir(temp_dir)
            except OSError:
                pass
else:
    st.info("Upload documents and click Analyze to start.")