Spaces:
Configuration error
Configuration error
Upload 8 files
Browse files- README.md +55 -19
- app.py +166 -0
- backend.py +200 -0
- doc_a.txt +3 -0
- doc_b.txt +3 -0
- requirements.txt +6 -2
- run_app.sh +12 -0
- verify_backend.py +55 -0
README.md
CHANGED
|
@@ -1,19 +1,55 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Long-Context Document Semantic Analysis System
|
| 2 |
+
|
| 3 |
+
This intelligent AI system analyzes long documents to automatically detect duplicates, contradictions, and inconsistencies using state-of-the-art Natural Language Processing (NLP) techniques.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
- **Duplicate Detection**: Identifies semantically identical or near-identical text segments using SBERT embeddings and FAISS vector search.
|
| 7 |
+
- **Contradiction Detection**: Uses a Cross-Encoder Natural Language Inference (NLI) model to flag logically conflicting statements.
|
| 8 |
+
- **Holistic Analysis**: Processes multiple documents (PDF, TXT) to find inconsistencies across the entire corpus.
|
| 9 |
+
- **Evidence-Based Reporting**: Generates a downloadable Markdown report with source references and confidence scores.
|
| 10 |
+
|
| 11 |
+
## Architecture
|
| 12 |
+
1. **Document Processing**: Extracts text from PDFs/TXTs and chunks it into overlapping segments.
|
| 13 |
+
2. **Embedding Generation**: `sentence-transformers/all-MiniLM-L6-v2` maps chunks to dense vector space.
|
| 14 |
+
3. **Similarity Search**: `FAISS` efficiently finds potential duplicate candidates.
|
| 15 |
+
4. **Logical Inference**: `cross-encoder/nli-distilroberta-base` verifies logical relationships (Contradiction/Entailment) between similar chunks.
|
| 16 |
+
|
| 17 |
+
## Installation
|
| 18 |
+
|
| 19 |
+
1. **Create a Virtual Environment** (Recommended):
|
| 20 |
+
```bash
|
| 21 |
+
python3 -m venv venv
|
| 22 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
2. **Install Dependencies**:
|
| 26 |
+
```bash
|
| 27 |
+
pip install -r requirements.txt
|
| 28 |
+
```
|
| 29 |
+
*Note: PyTorch installation might take a few minutes.*
|
| 30 |
+
|
| 31 |
+
## Usage
|
| 32 |
+
|
| 33 |
+
1. **Start the Application**:
|
| 34 |
+
```bash
|
| 35 |
+
streamlit run app.py
|
| 36 |
+
```
|
| 37 |
+
OR using the venv directly:
|
| 38 |
+
```bash
|
| 39 |
+
./venv/bin/streamlit run app.py
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
2. **Navigate to the UI**:
|
| 43 |
+
Open your browser at `http://localhost:8501`.
|
| 44 |
+
|
| 45 |
+
3. **Analyze**:
|
| 46 |
+
- Upload PDF or TXT files via the sidebar.
|
| 47 |
+
- Click "Analyze Documents".
|
| 48 |
+
- View results on the dashboard and download the report.
|
| 49 |
+
|
| 50 |
+
## Verification
|
| 51 |
+
To verify the core logic without the UI:
|
| 52 |
+
```bash
|
| 53 |
+
./venv/bin/python verify_backend.py
|
| 54 |
+
```
|
| 55 |
+
This generates sample contradictory documents and checks if the system flags them correctly.
|
app.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import pandas as pd
import os
import tempfile
from backend import SemanticAnalyzer

st.set_page_config(page_title="Semantic Document Analyzer", layout="wide")

# Global CSS overrides: gradient background, styled buttons, metrics and headings.
st.markdown("""
<style>
/* Premium Look & Feel */
.stApp {
    background: linear-gradient(to right, #f8f9fa, #e9ecef);
    font-family: 'Inter', sans-serif;
}
.stButton>button {
    background: linear-gradient(45deg, #4f46e5, #7c3aed);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 0.75rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
}
.stButton>button:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(79, 70, 229, 0.3);
}
div[data-testid="stMetricValue"] {
    color: #111827;
    font-weight: 700;
}
h1 {
    background: -webkit-linear-gradient(45deg, #1e3a8a, #3b82f6);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800 !important;
}
.css-1d391kg {
    background-color: #ffffff;
    border-radius: 12px;
    padding: 1.5rem;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
}
</style>
""", unsafe_allow_html=True)

st.title("🧠 Semantic Document Analyzer")
st.markdown("""
<div style='background-color: white; padding: 1.5rem; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); margin-bottom: 2rem;'>
<h4 style='margin-top:0'>Holistic Document Understanding</h4>
<p style='color: #4b5563;'>
This AI system leverages <b>Sentence-BERT</b> and <b>Cross-Encoders</b> to perform deep semantic analysis across long documents.
It goes beyond simple keyword matching to understand context, detecting subtle contradictions and semantic duplicates.
</p>
</div>
""", unsafe_allow_html=True)

# Sidebar: file upload plus the analysis trigger.
with st.sidebar:
    st.header("Upload Documents")
    # FIX: backend.DocumentProcessor.extract_text supports .txt as well as .pdf
    # (and the README advertises both), but the uploader previously only
    # accepted PDFs. Widening `type` is backward-compatible.
    uploaded_files = st.file_uploader("Upload PDF or TXT files", type=['pdf', 'txt'], accept_multiple_files=True)
    analyze_btn = st.button("Analyze Documents", type="primary")

if analyze_btn and uploaded_files:
    # NOTE: a previous `if len(uploaded_files) == 0` branch here was unreachable
    # (the outer condition already requires a non-empty upload list); removed.
    with st.spinner("Processing documents... This may take a while for large files."):
        # Persist the uploads to a temp dir so the backend can read them by path.
        temp_dir = tempfile.mkdtemp()
        file_paths = []
        for uploaded_file in uploaded_files:
            path = os.path.join(temp_dir, uploaded_file.name)
            with open(path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(path)

        try:
            analyzer = SemanticAnalyzer()
            results = analyzer.analyze_documents(file_paths)

            # Cleanup intentionally left disabled while results are on screen.
            # for path in file_paths: os.remove(path)
            # os.rmdir(temp_dir)

            if "error" in results:
                st.error(results["error"])
            else:
                # Dashboard layout: headline metrics first.
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Total Documents", results['stats']['total_docs'])
                with col2:
                    st.metric("Total Text Chunks", results['stats']['total_chunks'])

                st.divider()

                # 1. Duplicates: side-by-side chunk pair per expander.
                st.subheader(f"⚠️ Potential Duplicates Detected ({len(results['duplicates'])})")
                if results['duplicates']:
                    for dup in results['duplicates']:
                        with st.expander(f"Similarity Score: {dup['score']:.4f}"):
                            c1, c2 = st.columns(2)
                            with c1:
                                st.caption(f"Source: {dup['chunk_a']['source']}")
                                st.info(dup['chunk_a']['text'])
                            with c2:
                                st.caption(f"Source: {dup['chunk_b']['source']}")
                                st.info(dup['chunk_b']['text'])
                else:
                    st.success("No duplicates found.")

                st.divider()

                # 2. Contradictions: same layout, warning styling.
                st.subheader(f"🛑 Contradictions / Inconsistencies ({len(results['contradictions'])})")
                if results['contradictions']:
                    for contra in results['contradictions']:
                        with st.expander(f"Contradiction Confidence: {contra['confidence']:.4f}"):
                            c1, c2 = st.columns(2)
                            with c1:
                                st.caption(f"Source: {contra['chunk_a']['source']}")
                                st.warning(contra['chunk_a']['text'])
                            with c2:
                                st.caption(f"Source: {contra['chunk_b']['source']}")
                                st.warning(contra['chunk_b']['text'])
                else:
                    # CONSISTENCY: mirror the duplicates section's empty-state message.
                    st.success("No contradictions found.")

                # Export a Markdown report mirroring the on-screen findings.
                report_text = f"# Semantic Analysis Report\n\n"
                report_text += f"Total Documents: {results['stats']['total_docs']}\n"
                report_text += f"Total Chunks: {results['stats']['total_chunks']}\n\n"

                report_text += "## Duplicates\n"
                if results['duplicates']:
                    for d in results['duplicates']:
                        report_text += f"- Score: {d['score']:.4f}\n"
                        report_text += f"  - Source A: {d['chunk_a']['source']} | \"{d['chunk_a']['text'][:100]}...\"\n"
                        report_text += f"  - Source B: {d['chunk_b']['source']} | \"{d['chunk_b']['text'][:100]}...\"\n\n"
                else:
                    report_text += "No duplicates found.\n\n"

                report_text += "## Contradictions\n"
                if results['contradictions']:
                    for c in results['contradictions']:
                        report_text += f"- Confidence: {c['confidence']:.4f}\n"
                        report_text += f"  - Source A: {c['chunk_a']['source']} | \"{c['chunk_a']['text']}\"\n"
                        report_text += f"  - Source B: {c['chunk_b']['source']} | \"{c['chunk_b']['text']}\"\n\n"
                else:
                    report_text += "No contradictions found.\n"

                st.download_button(
                    label="Download Report (Markdown)",
                    data=report_text,
                    file_name="analysis_report.md",
                    mime="text/markdown"
                )

        except Exception as e:
            st.error(f"An error occurred during analysis: {str(e)}")
            import traceback
            st.write(traceback.format_exc())

else:
    st.info("Upload documents and click Analyze to start.")
|
backend.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import List, Dict, Tuple
|
| 3 |
+
import pypdf
|
| 4 |
+
import numpy as np
|
| 5 |
+
import faiss
|
| 6 |
+
import torch
|
| 7 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 8 |
+
|
| 9 |
+
class DocumentProcessor:
    """Static helpers: load raw text from a file and split it into overlapping chunks."""

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the plain text of a .pdf or .txt file.

        Unknown extensions yield an empty string rather than raising.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.pdf':
            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    # extract_text() may return None/"" for image-only pages.
                    if page_text:
                        text += page_text + "\n"
                return text
        elif ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        else:
            return ""

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
        """
        Split *text* into overlapping character-window chunks.

        Returns a list of dicts with 'id', 'text' (stripped), 'start_char',
        'end_char'. A simple character sliding window is used for simplicity;
        ideally this would be token- or sentence-based.
        """
        chunks = []
        text_len = len(text)
        start = 0
        chunk_id = 0

        while start < text_len:
            end = min(start + chunk_size, text_len)
            piece = text[start:end]  # renamed: previously shadowed the method name

            # Prefer to cut at the last period/newline so chunks end cleanly,
            # but only if that boundary is past the window midpoint.
            if end < text_len:
                last_period = piece.rfind('.')
                last_newline = piece.rfind('\n')
                break_point = max(last_period, last_newline)
                if break_point != -1 and break_point > chunk_size * 0.5:
                    end = start + break_point + 1
                    piece = text[start:end]

            chunks.append({
                'id': chunk_id,
                'text': piece.strip(),
                'start_char': start,
                'end_char': end
            })
            chunk_id += 1

            # BUG FIX: the original advanced `start = end - overlap` unconditionally,
            # so once `end` reached the end of the text any remaining tail of length
            # <= overlap was re-chunked forever (infinite loop for every text shorter
            # than chunk_size but longer than overlap). Stop once the text is consumed.
            if end >= text_len:
                break
            start = end - overlap

        return chunks
|
| 67 |
+
|
| 68 |
+
class EmbeddingEngine:
    """Thin wrapper around a SentenceTransformer that emits L2-normalised vectors."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        # Prefer the GPU when available; SentenceTransformer would usually
        # auto-detect, but being explicit keeps the choice visible.
        backend = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer(model_name, device=backend)

    def encode(self, texts: List[str]) -> np.ndarray:
        """Embed *texts*; vectors are normalised in place so that inner
        product equals cosine similarity when used with FAISS."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        faiss.normalize_L2(vectors)
        return vectors
|
| 79 |
+
|
| 80 |
+
class VectorStore:
    """Exact nearest-neighbour store over L2-normalised embeddings.

    Uses a flat inner-product FAISS index; with normalised inputs the inner
    product is exactly the cosine similarity.
    """

    def __init__(self, dimension: int):
        self.dimension = dimension
        self.index = faiss.IndexFlatIP(dimension)

    def add(self, embeddings: np.ndarray):
        """Append embedding rows to the index."""
        self.index.add(embeddings)

    def search(self, query_embeddings: np.ndarray, k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        """Return FAISS (similarities, indices) arrays for the top-*k* matches per query row."""
        return self.index.search(query_embeddings, k)
|
| 90 |
+
|
| 91 |
+
class SemanticAnalyzer:
    """End-to-end pipeline: chunk documents, embed, index, then flag
    near-duplicate and mutually contradictory chunk pairs."""

    def __init__(self):
        self.embedding_engine = EmbeddingEngine()
        # Cross-encoder NLI model used to classify the relation between two
        # similar chunks. Assumed label order for
        # cross-encoder/nli-distilroberta-base: 0=contradiction, 1=entailment,
        # 2=neutral — TODO confirm against the model card.
        self.nli_model = CrossEncoder('cross-encoder/nli-distilroberta-base')

    def analyze_documents(self, file_paths: List[str]) -> Dict:
        """
        Run the full analysis pipeline over *file_paths*.

        Returns a dict with keys 'duplicates', 'contradictions' and 'stats',
        or {"error": ...} when no text could be extracted from any input.
        """
        all_chunks = []
        doc_map = {} # chunk_id -> source_doc (currently unused)

        # 1. Load and chunk each document, tagging every chunk with a
        #    corpus-wide id and its source filename.
        global_chunk_id = 0
        for fpath in file_paths:
            fname = os.path.basename(fpath)
            raw_text = DocumentProcessor.extract_text(fpath)
            chunks = DocumentProcessor.chunk_text(raw_text)
            for c in chunks:
                c['global_id'] = global_chunk_id
                c['source'] = fname
                all_chunks.append(c)
                global_chunk_id += 1

        if not all_chunks:
            return {"error": "No text extracted"}

        texts = [c['text'] for c in all_chunks]

        # 2. Embed all chunks (the engine L2-normalises the vectors).
        embeddings = self.embedding_engine.encode(texts)

        # 3. Build an exact inner-product index over the embeddings.
        d = embeddings.shape[1]
        vector_store = VectorStore(d)
        vector_store.add(embeddings)

        results = {
            "duplicates": [],
            "contradictions": [],
            "stats": {
                "total_docs": len(file_paths),
                "total_chunks": len(all_chunks)
            }
        }

        # 4. Detect duplicates & contradictions: query every chunk against the
        #    index for its nearest neighbours (k capped by corpus size).
        D, I = vector_store.search(embeddings, k=min(10, len(all_chunks)))

        checked_pairs = set()

        for i in range(len(all_chunks)):
            for rank, j in enumerate(I[i]):
                if i == j: continue # Skip self-match

                sim_score = D[i][rank]
                if sim_score < 0.5: continue # below this, neither check is worth running

                # Canonicalise the pair so (i, j) and (j, i) are checked only once.
                pair = tuple(sorted((i, j)))
                if pair in checked_pairs:
                    continue
                checked_pairs.add(pair)

                chunk_a = all_chunks[i]
                chunk_b = all_chunks[j]

                # DUPLICATE DETECTION: cosine similarity above 0.95 is treated
                # as a near-duplicate pair.
                if sim_score > 0.95:
                    results["duplicates"].append({
                        "score": float(sim_score),
                        "chunk_a": chunk_a,
                        "chunk_b": chunk_b
                    })
                    continue # near-duplicates are not also tested for contradiction

                # CONTRADICTION DETECTION: same topic (high similarity) but not
                # identical — run NLI on the pair.
                if sim_score > 0.65:
                    # CrossEncoder.predict takes a list of (text_a, text_b)
                    # pairs and returns one logit row per pair.
                    scores = self.nli_model.predict([(chunk_a['text'], chunk_b['text'])])
                    # argmax picks the predicted NLI label; index 0 is assumed
                    # to be 'contradiction' (see note in __init__).
                    label = scores[0].argmax()

                    if label == 0: # Contradiction
                        results["contradictions"].append({
                            "similarity": float(sim_score),
                            # NOTE(review): raw contradiction *logit*, not a
                            # probability — apply softmax if a calibrated
                            # confidence is ever needed.
                            "confidence": float(scores[0][0]),
                            "chunk_a": chunk_a,
                            "chunk_b": chunk_b
                        })

        return results
|
doc_a.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The software release is scheduled for Q3 2024.
|
| 2 |
+
Machine learning models require vast amounts of data.
|
| 3 |
+
This is a generic statement about AI capabilities.
|
doc_b.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The software release is strictly scheduled for Q4 2025.
|
| 2 |
+
Machine learning models require vast amounts of data.
|
| 3 |
+
AI generates images from text prompts.
|
requirements.txt
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
pandas
|
| 3 |
-
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
sentence-transformers
|
| 3 |
+
faiss-cpu
|
| 4 |
+
torch
|
| 5 |
+
numpy
|
| 6 |
pandas
|
| 7 |
+
pypdf
|
run_app.sh
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch the Semantic Analyzer with the virtual environment's streamlit so
# venv-installed packages are found (avoids ModuleNotFoundError).

# Guard clause: refuse to run without the venv.
# FIX: the original fell through after the error messages and exited with the
# status of the last `echo` (0), masking the failure for callers/CI.
if [ ! -d "venv" ]; then
    echo "Error: Virtual environment 'venv' not found."
    echo "Please run: python3 -m venv venv && ./venv/bin/pip install -r requirements.txt"
    exit 1
fi

echo "Starting Semantic Analyzer from venv..."
./venv/bin/streamlit run app.py
|
verify_backend.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from backend import SemanticAnalyzer
|
| 4 |
+
|
| 5 |
+
def create_dummy_files():
    """Recreate test_data/ with two small documents that overlap and conflict.

    doc_a and doc_b share one identical sentence (duplicate case) and state
    conflicting release dates (contradiction case); the remaining lines are
    unrelated filler.
    """
    if os.path.exists("test_data"):
        shutil.rmtree("test_data")
    os.makedirs("test_data")

    doc_a_lines = [
        "The software release is scheduled for Q3 2024.\n",
        "Machine learning models require vast amounts of data.\n",
        "This is a generic statement about AI capabilities.\n",
    ]
    doc_b_lines = [
        "The software release is strictly scheduled for Q4 2025.\n",  # contradicts doc_a
        "Machine learning models require vast amounts of data.\n",    # duplicates doc_a
        "AI generates images from text prompts.\n",                   # unrelated
    ]

    with open("test_data/doc_a.txt", "w") as f:
        f.writelines(doc_a_lines)
    with open("test_data/doc_b.txt", "w") as f:
        f.writelines(doc_b_lines)
|
| 21 |
+
|
| 22 |
+
def run_test():
    """Smoke-test the backend: build fixtures, analyze them, print a verdict."""
    create_dummy_files()
    files = ["test_data/doc_a.txt", "test_data/doc_b.txt"]

    print("Initializing Analyzer...")
    analyzer = SemanticAnalyzer()

    print("Analyzing...")
    results = analyzer.analyze_documents(files)

    print("\n=== RESULTS ===")
    print(f"Duplicates found: {len(results['duplicates'])}")
    for d in results['duplicates']:
        print(f"  [Match] ({d['score']:.4f})")
        print(f"    A: {d['chunk_a']['text']}")
        print(f"    B: {d['chunk_b']['text']}")

    print(f"\nContradictions found: {len(results['contradictions'])}")
    for c in results['contradictions']:
        print(f"  [Conflict] (Conf: {c['confidence']:.4f})")
        print(f"    A: {c['chunk_a']['text']}")
        print(f"    B: {c['chunk_b']['text']}")

    # Validation: the shared sentence must be flagged as a duplicate and the
    # conflicting release dates as a contradiction.
    dup_hit = any("vast amounts of data" in d['chunk_a']['text'] for d in results['duplicates'])
    contra_hit = any("software release" in c['chunk_a']['text'] for c in results['contradictions'])

    verdict = ("\n✅ VERIFICATION PASSED: Core logic works."
               if dup_hit and contra_hit
               else "\n❌ VERIFICATION FAILED: Missing expected detections.")
    print(verdict)
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
|
| 55 |
+
run_test()
|