File size: 7,314 Bytes
253246d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import streamlit as st
import pandas as pd
import os
import tempfile
from backend import SemanticAnalyzer

# Must be the first Streamlit call on the page.
st.set_page_config(page_title="Semantic Document Analyzer", layout="wide")

# Global CSS theme: gradient page background, gradient buttons with a hover
# lift, bold metric values, gradient-filled <h1> text, and a card look for
# the sidebar container. NOTE(review): `.css-1d391kg` is an auto-generated
# Streamlit class name and may break across Streamlit versions — confirm.
_CUSTOM_CSS = """
    <style>
    /* Premium Look & Feel */
    .stApp {
        background: linear-gradient(to right, #f8f9fa, #e9ecef);
        font-family: 'Inter', sans-serif;
    }
    .stButton>button {
        background: linear-gradient(45deg, #4f46e5, #7c3aed);
        color: white;
        border: none;
        border-radius: 8px;
        padding: 0.75rem 1.5rem;
        font-weight: 600;
        transition: all 0.3s ease;
    }
    .stButton>button:hover {
        transform: translateY(-2px);
        box-shadow: 0 4px 12px rgba(79, 70, 229, 0.3);
    }
    div[data-testid="stMetricValue"] {
        color: #111827;
        font-weight: 700;
    }
    h1 {
        background: -webkit-linear-gradient(45deg, #1e3a8a, #3b82f6);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-weight: 800 !important;
    }
    .css-1d391kg {
        background-color: #ffffff;
        border-radius: 12px;
        padding: 1.5rem;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    }
    </style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

st.title("🧠 Semantic Document Analyzer")

# Intro card (raw HTML) describing what the analyzer does.
_INTRO_HTML = """
<div style='background-color: white; padding: 1.5rem; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); margin-bottom: 2rem;'>
    <h4 style='margin-top:0'>Holistic Document Understanding</h4>
    <p style='color: #4b5563;'>
    This AI system leverages <b>Sentence-BERT</b> and <b>Cross-Encoders</b> to perform deep semantic analysis across long documents.
    It goes beyond simple keyword matching to understand context, detecting subtle contradictions and semantic duplicates.
    </p>
</div>
"""
st.markdown(_INTRO_HTML, unsafe_allow_html=True)

# Sidebar controls: PDF upload widget plus the trigger button.
# `uploaded_files` and `analyze_btn` are read by the main flow below.
st.sidebar.header("Upload Documents")
uploaded_files = st.sidebar.file_uploader(
    "Upload PDF files", type=["pdf"], accept_multiple_files=True
)
analyze_btn = st.sidebar.button("Analyze Documents", type="primary")

def _save_uploads(files, dest_dir):
    """Write each uploaded file into *dest_dir*; return the saved paths."""
    paths = []
    for uploaded in files:
        path = os.path.join(dest_dir, uploaded.name)
        with open(path, "wb") as fh:
            fh.write(uploaded.getbuffer())
        paths.append(path)
    return paths


def _build_report(results):
    """Format the analysis *results* dict as a Markdown report string.

    Expects the keys the backend returns: ``stats`` (with ``total_docs`` /
    ``total_chunks``), ``duplicates`` and ``contradictions``.
    """
    # Collect fragments and join once instead of repeated `+=` concatenation.
    parts = ["# Semantic Analysis Report\n\n"]
    parts.append(f"Total Documents: {results['stats']['total_docs']}\n")
    parts.append(f"Total Chunks: {results['stats']['total_chunks']}\n\n")

    parts.append("## Duplicates\n")
    if results['duplicates']:
        for d in results['duplicates']:
            parts.append(f"- Score: {d['score']:.4f}\n")
            # Duplicate chunks can be long; truncate to the first 100 chars.
            parts.append(f"  - Source A: {d['chunk_a']['source']} | \"{d['chunk_a']['text'][:100]}...\"\n")
            parts.append(f"  - Source B: {d['chunk_b']['source']} | \"{d['chunk_b']['text'][:100]}...\"\n\n")
    else:
        parts.append("No duplicates found.\n\n")

    parts.append("## Contradictions\n")
    if results['contradictions']:
        for c in results['contradictions']:
            parts.append(f"- Confidence: {c['confidence']:.4f}\n")
            parts.append(f"  - Source A: {c['chunk_a']['source']} | \"{c['chunk_a']['text']}\"\n")
            parts.append(f"  - Source B: {c['chunk_b']['source']} | \"{c['chunk_b']['text']}\"\n\n")
    else:
        parts.append("No contradictions found.\n")
    return "".join(parts)


def _render_pair(pair, box):
    """Show a chunk pair side by side using the given message box (st.info/st.warning)."""
    c1, c2 = st.columns(2)
    with c1:
        st.caption(f"Source: {pair['chunk_a']['source']}")
        box(pair['chunk_a']['text'])
    with c2:
        st.caption(f"Source: {pair['chunk_b']['source']}")
        box(pair['chunk_b']['text'])


def _render_results(results):
    """Draw the dashboard: metrics, duplicates, contradictions, report download."""
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Documents", results['stats']['total_docs'])
    with col2:
        st.metric("Total Text Chunks", results['stats']['total_chunks'])

    st.divider()

    # 1. Duplicates
    st.subheader(f"⚠️ Potential Duplicates Detected ({len(results['duplicates'])})")
    if results['duplicates']:
        for dup in results['duplicates']:
            with st.expander(f"Similarity Score: {dup['score']:.4f}"):
                _render_pair(dup, st.info)
    else:
        st.success("No duplicates found.")

    st.divider()

    # 2. Contradictions
    st.subheader(f"🛑 Contradictions / Inconsistencies ({len(results['contradictions'])})")
    if results['contradictions']:
        for contra in results['contradictions']:
            with st.expander(f"Contradiction Confidence: {contra['confidence']:.4f}"):
                _render_pair(contra, st.warning)
    else:
        # Consistency fix: mirror the duplicates section's empty-state message
        # (the original rendered nothing here).
        st.success("No contradictions found.")

    st.download_button(
        label="Download Report (Markdown)",
        data=_build_report(results),
        file_name="analysis_report.md",
        mime="text/markdown",
    )


if analyze_btn and uploaded_files:
    with st.spinner("Processing documents... This may take a while for large files."):
        # TemporaryDirectory guarantees the saved PDFs are removed even when
        # analysis raises (the original used mkdtemp() and never cleaned up —
        # the cleanup code was commented out).
        with tempfile.TemporaryDirectory() as temp_dir:
            file_paths = _save_uploads(uploaded_files, temp_dir)
            try:
                analyzer = SemanticAnalyzer()
                results = analyzer.analyze_documents(file_paths)
            except Exception as e:
                st.error(f"An error occurred during analysis: {str(e)}")
                import traceback
                st.write(traceback.format_exc())
            else:
                # Backend signals failures via an "error" key in the result.
                if "error" in results:
                    st.error(results["error"])
                else:
                    _render_results(results)
elif analyze_btn:
    # Bug fix: the original checked `len(uploaded_files) == 0` *inside* the
    # `analyze_btn and uploaded_files` branch, so this error was unreachable.
    st.error("Please upload at least one document.")
else:
    st.info("Upload documents and click Analyze to start.")