Commit f02f2d2
Parent(s): b663e26
Upload full Streamlit app from GitHub
This view is limited to 50 files because it contains too many changes. See raw diff
- requirements.txt +35 -3
- src/README.md +159 -0
- src/TESTING.md +170 -0
- src/__pycache__/config.cpython-312.pyc +0 -0
- src/__pycache__/config.cpython-313.pyc +0 -0
- src/__pycache__/streamlit_app.cpython-313.pyc +0 -0
- src/agents/__init__.py +0 -0
- src/agents/__pycache__/__init__.cpython-312.pyc +0 -0
- src/agents/__pycache__/__init__.cpython-313.pyc +0 -0
- src/agents/__pycache__/base_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/ingestion_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/ingestion_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/table_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/text_agent.cpython-312.pyc +0 -0
- src/agents/base_agent.py +44 -0
- src/agents/ingestion_agent.py +176 -0
- src/agents/ingestion_agent_alternative.py +167 -0
- src/agents/table_agent.py +156 -0
- src/agents/text_agent.py +137 -0
- src/config.py +49 -0
- src/create_test_docs.py +189 -0
- src/models/__init__.py +14 -0
- src/models/__pycache__/__init__.cpython-312.pyc +0 -0
- src/models/__pycache__/__init__.cpython-313.pyc +0 -0
- src/models/__pycache__/document.cpython-312.pyc +0 -0
- src/models/__pycache__/document.cpython-313.pyc +0 -0
- src/models/__pycache__/similarity.cpython-312.pyc +0 -0
- src/models/__pycache__/similarity.cpython-313.pyc +0 -0
- src/models/document.py +142 -0
- src/models/similarity.py +40 -0
- src/orchestrator/__init__.py +0 -0
- src/orchestrator/__pycache__/__init__.cpython-312.pyc +0 -0
- src/orchestrator/__pycache__/__init__.cpython-313.pyc +0 -0
- src/orchestrator/__pycache__/scorers.cpython-312.pyc +0 -0
- src/orchestrator/__pycache__/scorers.cpython-313.pyc +0 -0
- src/orchestrator/__pycache__/similarity_orchestrator.cpython-312.pyc +0 -0
- src/orchestrator/__pycache__/similarity_orchestrator.cpython-313.pyc +0 -0
- src/orchestrator/scorers.py +197 -0
- src/orchestrator/similarity_orchestrator.py +130 -0
- src/requirements-alternative.txt +37 -0
- src/requirements.txt +35 -0
- src/storage/__init__.py +0 -0
- src/storage/vector_store.py +183 -0
- src/streamlit_app.py +296 -35
- src/utils/__init__.py +0 -0
- src/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- src/utils/__pycache__/file_handler.cpython-312.pyc +0 -0
- src/utils/__pycache__/visualization.cpython-312.pyc +0 -0
- src/utils/__pycache__/visualization.cpython-313.pyc +0 -0
requirements.txt
CHANGED
@@ -1,3 +1,35 @@
-
-
-
+# Core framework
+streamlit>=1.31.0
+
+# Data models
+pydantic>=2.6.0
+
+# Document parsing - using versions compatible with Python 3.13
+# Use pypdf if PyMuPDF has DLL issues on Windows
+pypdf>=4.0.0  # Fallback PDF parser (pure Python, no DLL dependencies)
+python-docx>=1.1.0
+pdfplumber>=0.10.0
+
+# ML & Embeddings
+sentence-transformers>=2.3.0
+torch>=2.2.0
+
+# Vector storage
+faiss-cpu>=1.7.0
+
+# Data processing
+numpy>=1.26.0
+pandas>=2.2.0
+Pillow>=10.2.0
+
+# Utilities
+python-dotenv>=1.0.0
+
+# Visualization
+plotly>=5.18.0
+
+# Async
+aiofiles>=23.2.0
+
+# Similarity metrics
+scikit-learn>=1.3.0
src/README.md
ADDED
@@ -0,0 +1,159 @@
+# agentic-multimodal-doc-comparator
+
+An agentic system to accurately measure the similarity of two documents containing complex design content
+
+
+
+## Features (Phase 1)
+
+- **Multi-modal document analysis**: Text and table extraction
+- **Semantic similarity**: Uses sentence-transformers for embeddings
+- **Interactive Streamlit UI**: Easy-to-use web interface
+- **Support for PDF and DOCX**: Compare documents in multiple formats
+- **Detailed similarity reports**: Per-modality breakdown and matched sections
+- **Configurable weights**: Adjust importance of text vs. tables
+
+## System Architecture
+
+The system implements a 6-layer architecture:
+
+1. **Input Layer**: Accepts PDF/DOCX documents
+2. **Ingestion Layer**: Extracts raw content (text, tables)
+3. **Modality Extractors**: Specialized agents for text and table processing
+4. **Vector Store**: FAISS-based similarity search
+5. **Orchestrator**: Coordinates comparison and aggregates scores
+6. **Output Layer**: Similarity report with visualizations
+
+## Installation
+
+### Prerequisites
+
+- Python 3.8+
+- pip
+
+### Setup
+
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd agentic-multimodal-doc-comparator
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. (Optional) Set up environment variables:
+```bash
+cp .env.example .env
+# Edit .env with your API keys (for Phase 2 features)
+```
+
+## Usage
+
+### Running the Streamlit App
+
+```bash
+streamlit run streamlit_app.py
+```
+
+The app will open in your browser at `http://localhost:8501`.
+
+### Using the App
+
+1. **Upload Documents**: Upload two documents (PDF or DOCX) in the designated areas
+2. **Adjust Weights**: Use the sidebar to adjust the weight given to text vs. table comparison
+3. **Compare**: Click the "Compare Documents" button
+4. **View Results**:
+   - Overall similarity score (0-100%)
+   - Per-modality breakdown (text and table scores)
+   - Top matched sections from both documents
+5. **Download Report**: Export results as JSON for further analysis
+
+## Project Structure
+
+```
+agentic-multimodal-doc-comparator/
+├── agents/                         # Modality extraction agents
+│   ├── base_agent.py               # Abstract base class
+│   ├── ingestion_agent.py          # PDF/DOCX parsing
+│   ├── text_agent.py               # Text chunking & embeddings
+│   └── table_agent.py              # Table extraction & embeddings
+├── orchestrator/                   # Similarity orchestration
+│   ├── scorers.py                  # Per-modality scoring
+│   └── similarity_orchestrator.py  # Main orchestrator
+├── storage/                        # Vector storage
+│   └── vector_store.py             # FAISS wrapper
+├── models/                         # Data models
+│   ├── document.py                 # Document structures
+│   └── similarity.py               # Similarity report structures
+├── utils/                          # Utilities
+│   ├── file_handler.py             # File upload/validation
+│   └── visualization.py            # Result visualization
+├── config.py                       # Configuration
+├── streamlit_app.py                # Main Streamlit UI
+└── requirements.txt                # Dependencies
+```
+
+## Configuration
+
+Edit `config.py` to customize:
+
+- **Embedding model**: Default is `all-MiniLM-L6-v2`
+- **Chunk size**: Default 512 tokens with 50-token overlap
+- **Modality weights**: Default 60% text, 40% tables
+- **File limits**: Default 50MB max file size
+
+## Phase 2 Roadmap
+
+Future enhancements include:
+
+- **Image Agent**: Extract and compare images using CLIP embeddings
+- **Layout Agent**: Analyze document structure and section hierarchy
+- **Meta Agent**: Compare metadata (title, author, date, keywords)
+- **Batch Comparison**: Compare 1 document against N documents
+- **Enhanced UI**: Visual diff, interactive navigation, filtering
+
+## Technical Details
+
+### Models & Libraries
+
+- **Embedding**: sentence-transformers (all-MiniLM-L6-v2, 384 dimensions)
+- **PDF Parsing**: PyMuPDF (text) + pdfplumber (tables)
+- **DOCX Parsing**: python-docx
+- **Vector Search**: FAISS (cosine similarity)
+- **UI**: Streamlit with Plotly visualizations
+
+### Similarity Scoring
+
+- **Text**: Cosine similarity between chunk embeddings, averaged over best matches
+- **Tables**: Schema and content similarity using linearized table embeddings
+- **Overall**: Weighted combination of modality scores
+
+## Troubleshooting
+
+### Common Issues
+
+1. **"Module not found" errors**: Run `pip install -r requirements.txt`
+2. **Large files timing out**: Reduce document size or increase timeout in config
+3. **Memory errors**: Process smaller documents or reduce chunk overlap
+4. **No matches found**: Documents may be too dissimilar or use different terminology
+
+## Contributing
+
+Contributions welcome! Please:
+
+1. Fork the repository
+2. Create a feature branch
+3. Make your changes
+4. Submit a pull request
+
+## License
+
+MIT License
+
+## Acknowledgments
+
+- Architecture inspired by multi-agent RAG systems
+- Built with Streamlit, sentence-transformers, and FAISS
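The scoring scheme the README describes (per-modality cosine similarity averaged over best matches, then a weighted combination) can be sketched in a few lines of NumPy. This is an illustration of the described behavior only, not the repository's `orchestrator/scorers.py`, whose body does not appear in this view; `modality_score` and `overall_score` are illustrative names.

```python
import numpy as np

def modality_score(emb_a: np.ndarray, emb_b: np.ndarray) -> float:
    """Average best-match cosine similarity from doc A's chunks to doc B's."""
    if emb_a.size == 0 or emb_b.size == 0:
        return 0.0
    a = emb_a / np.linalg.norm(emb_a, axis=1, keepdims=True)  # assumes non-zero rows
    b = emb_b / np.linalg.norm(emb_b, axis=1, keepdims=True)
    sims = a @ b.T                           # pairwise cosine similarity matrix
    return float(sims.max(axis=1).mean())    # best match per chunk, then average

def overall_score(text_score: float, table_score: float) -> float:
    """Weighted combination using the default 60/40 split from config.py."""
    weights = {"text": 0.60, "table": 0.40}
    return weights["text"] * text_score + weights["table"] * table_score
```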
src/TESTING.md
ADDED
@@ -0,0 +1,170 @@
+# Testing Guide for Document Comparison App
+
+## Quick Start Testing
+
+### 1. Install Dependencies (if not done)
+
+```bash
+pip install -r requirements.txt
+```
+
+**Expected time**: 5-10 minutes (large packages like PyTorch)
+
+### 2. Create Test Documents
+
+```bash
+python create_test_docs.py
+```
+
+This creates three test documents:
+- `test_doc1.docx` - Product requirements document
+- `test_doc2.docx` - Similar document with differences
+- `test_doc3_identical.docx` - Identical to doc1
+
+### 3. Run the App
+
+```bash
+streamlit run streamlit_app.py
+```
+
+The app will open at: `http://localhost:8501`
+
+### 4. Test Scenarios
+
+#### Test Case 1: Similar Documents (Expected: 60-80% similarity)
+- **Document 1**: test_doc1.docx
+- **Document 2**: test_doc2.docx
+- **What to expect**:
+  - Overall similarity: ~65-75%
+  - Text similarity: ~70-80% (similar topics, some wording differences)
+  - Table similarity: ~50-60% (different tech stacks)
+  - Matched sections showing overlapping features and overview
+
+#### Test Case 2: Identical Documents (Expected: ~100% similarity)
+- **Document 1**: test_doc1.docx
+- **Document 2**: test_doc3_identical.docx
+- **What to expect**:
+  - Overall similarity: ~95-100%
+  - Text similarity: ~100%
+  - Table similarity: ~100%
+  - All sections matched
+
+#### Test Case 3: Test with Your Own Documents
+- Upload any two PDF or DOCX files (max 50MB each)
+- Adjust text/table weights in sidebar
+- View detailed comparison results
+
+## What to Look For
+
+### ✅ Successful Run Indicators
+
+1. **Progress bar completes** through all stages:
+   - Ingesting documents
+   - Extracting and embedding text
+   - Extracting and embedding tables
+   - Comparing documents
+
+2. **Results display shows**:
+   - Overall similarity gauge (0-100%)
+   - Bar chart with text and table scores
+   - Matched sections with content snippets
+   - Page numbers for each match
+
+3. **Download button** works and exports JSON report
+
+### ⚠️ Common Issues to Check
+
+1. **"Module not found" errors**
+   - Run: `pip install -r requirements.txt`
+
+2. **Model download on first run**
+   - sentence-transformers will download a ~90MB model the first time
+   - This is normal and only happens once
+
+3. **Memory warnings**
+   - Test with smaller documents first
+   - Close other applications if needed
+
+4. **Table extraction issues**
+   - Some PDFs may have tables in image format (won't extract)
+   - DOCX tables extract more reliably
+
+## Expected Performance
+
+- **Small documents** (< 5 pages): 5-15 seconds
+- **Medium documents** (5-20 pages): 15-45 seconds
+- **Large documents** (> 20 pages): 45+ seconds
+
+## Verifying Results
+
+### Text Similarity
+- Check "Matched Sections" to see side-by-side text comparisons
+- Higher scores = more semantic overlap
+- Look for similar topics even with different wording
+
+### Table Similarity
+- Compares table schemas (headers) and content
+- Identical tables = high score
+- Different schemas = lower score
+
+### Overall Score
+- Weighted combination (default: 60% text, 40% table)
+- Adjust weights in sidebar to change emphasis
+
+## Troubleshooting
+
+### App won't start
+```bash
+# Check Python version (need 3.8+)
+python --version
+
+# Reinstall streamlit
+pip install --upgrade streamlit
+```
+
+### Embeddings slow
+- First run downloads model (~90MB)
+- Subsequent runs use cached model
+- Consider using GPU if available (change to faiss-gpu in requirements)
+
+### No matches found
+- Documents may be too different
+- Try adjusting chunk size in config.py
+- Check if documents have extractable text (not scanned images)
+
+## Advanced Testing
+
+### Modify Configuration
+Edit `config.py` to adjust:
+```python
+TEXT_CHUNK_SIZE = 512  # Increase for longer context
+TEXT_CHUNK_OVERLAP = 50  # Increase for better matching
+MODALITY_WEIGHTS = {"text": 0.60, "table": 0.40}  # Adjust importance
+```
+
+### Test Different Document Types
+1. **Highly similar**: Same document, minor edits
+2. **Moderately similar**: Same topic, different authors
+3. **Dissimilar**: Completely different topics
+
+### Validate Accuracy
+Compare app results with manual review:
+- Do matched sections make sense?
+- Are similarity percentages reasonable?
+- Are table comparisons accurate?
+
+## Next Steps
+
+After successful testing:
+1. Test with your real documents
+2. Adjust weights based on your use case
+3. Consider Phase 2 features (image, layout, metadata comparison)
+4. Provide feedback for improvements
+
+## Support
+
+If you encounter issues:
+1. Check error message in terminal
+2. Verify all dependencies installed
+3. Ensure documents are valid PDF/DOCX
+4. Check file size limits (50MB default)
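For the last troubleshooting point above (checking whether a PDF has extractable text), a quick standalone check with pdfplumber (already in `requirements.txt`) tells you whether a document has a text layer. `your_document.pdf` is a placeholder path:

```python
import pdfplumber

# A scanned PDF typically returns None or an empty string here and would
# need OCR before it can be compared by this app.
with pdfplumber.open("your_document.pdf") as pdf:
    text = pdf.pages[0].extract_text()
    print("text layer found" if text and text.strip() else "no text layer (likely scanned)")
```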
src/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.39 kB)

src/__pycache__/config.cpython-313.pyc
ADDED
Binary file (1.38 kB)

src/__pycache__/streamlit_app.cpython-313.pyc
ADDED
Binary file (12.2 kB)

src/agents/__init__.py
ADDED
File without changes

src/agents/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (179 Bytes)

src/agents/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (179 Bytes)

src/agents/__pycache__/base_agent.cpython-312.pyc
ADDED
Binary file (1.91 kB)

src/agents/__pycache__/ingestion_agent.cpython-312.pyc
ADDED
Binary file (6.74 kB)

src/agents/__pycache__/ingestion_agent.cpython-313.pyc
ADDED
Binary file (6.7 kB)

src/agents/__pycache__/table_agent.cpython-312.pyc
ADDED
Binary file (6.42 kB)

src/agents/__pycache__/text_agent.cpython-312.pyc
ADDED
Binary file (5.14 kB)
src/agents/base_agent.py
ADDED
@@ -0,0 +1,44 @@
+"""
+Abstract base class for all modality agents.
+"""
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+
+class BaseAgent(ABC):
+    """Abstract base class for all modality agents in the system."""
+
+    def __init__(self, config: Dict[str, Any] = None):
+        """
+        Initialize the agent with configuration.
+
+        Args:
+            config: Configuration dictionary
+        """
+        self.config = config or {}
+
+    @abstractmethod
+    async def process(self, input_data: Any) -> Any:
+        """
+        Process input data and return structured output.
+
+        Args:
+            input_data: Input data to process
+
+        Returns:
+            Processed output specific to the agent type
+        """
+        pass
+
+    @abstractmethod
+    def get_agent_name(self) -> str:
+        """
+        Return the name of this agent for logging/tracking.
+
+        Returns:
+            Agent name as string
+        """
+        pass
+
+    def __repr__(self) -> str:
+        return f"{self.get_agent_name()}(config={self.config})"
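To make the `BaseAgent` contract concrete, here is a minimal hypothetical subclass; `EchoAgent` does not exist in the repository and is for illustration only:

```python
import asyncio
from typing import Any

from agents.base_agent import BaseAgent

class EchoAgent(BaseAgent):
    """Trivial agent that returns its input unchanged."""

    async def process(self, input_data: Any) -> Any:
        return input_data

    def get_agent_name(self) -> str:
        return "EchoAgent"

agent = EchoAgent({"verbose": True})
print(asyncio.run(agent.process("hello")))  # -> hello
print(agent)                                # -> EchoAgent(config={'verbose': True})
```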
src/agents/ingestion_agent.py
ADDED
@@ -0,0 +1,176 @@
+"""
+Document ingestion agent for extracting content from PDF and DOCX files.
+Supports both PyMuPDF and pypdf for PDF parsing.
+"""
+import pdfplumber
+from docx import Document
+from typing import Dict, List, Any
+from pathlib import Path
+
+from agents.base_agent import BaseAgent
+from models.document import RawDocument
+
+# Try to import PyMuPDF, fallback to pypdf if not available
+try:
+    import fitz  # PyMuPDF
+    USING_PYMUPDF = True
+    print("✓ Using PyMuPDF for PDF text extraction")
+except (ImportError, OSError) as e:
+    print(f"⚠ PyMuPDF not available ({e}), falling back to pypdf")
+    try:
+        from pypdf import PdfReader
+        USING_PYMUPDF = False
+        print("✓ Using pypdf for PDF text extraction")
+    except ImportError:
+        raise ImportError(
+            "Neither PyMuPDF nor pypdf is available. "
+            "Install one of them: pip install PyMuPDF or pip install pypdf"
+        )
+
+
+class IngestionAgent(BaseAgent):
+    """Agent responsible for extracting raw content from documents."""
+
+    def __init__(self, config: Dict[str, Any] = None):
+        super().__init__(config)
+
+    def get_agent_name(self) -> str:
+        return "IngestionAgent"
+
+    async def process(self, file_path: str) -> RawDocument:
+        """
+        Process a document file and extract raw content.
+
+        Args:
+            file_path: Path to PDF or DOCX file
+
+        Returns:
+            RawDocument containing extracted content
+        """
+        file_type = self._detect_file_type(file_path)
+
+        if file_type == "pdf":
+            return await self._ingest_pdf(file_path)
+        elif file_type == "docx":
+            return await self._ingest_docx(file_path)
+        else:
+            raise ValueError(f"Unsupported file type: {file_type}")
+
+    def _detect_file_type(self, file_path: str) -> str:
+        """Detect file type from extension."""
+        extension = Path(file_path).suffix.lower()
+        if extension == ".pdf":
+            return "pdf"
+        elif extension in [".docx", ".doc"]:
+            return "docx"
+        else:
+            raise ValueError(f"Unsupported file extension: {extension}")
+
+    async def _ingest_pdf(self, file_path: str) -> RawDocument:
+        """
+        Extract content from PDF file.
+
+        Args:
+            file_path: Path to PDF file
+
+        Returns:
+            RawDocument with extracted content
+        """
+        pages = []
+        raw_text = ""
+        raw_tables = []
+
+        # Extract text using PyMuPDF or pypdf
+        if USING_PYMUPDF:
+            # Extract text using PyMuPDF
+            with fitz.open(file_path) as pdf_doc:
+                for page_num, page in enumerate(pdf_doc, start=1):
+                    page_text = page.get_text()
+                    raw_text += page_text + "\n"
+                    pages.append({
+                        "page_num": page_num,
+                        "text": page_text
+                    })
+        else:
+            # Extract text using pypdf
+            reader = PdfReader(file_path)
+            for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text() or ""
+                raw_text += page_text + "\n"
+                pages.append({
+                    "page_num": page_num,
+                    "text": page_text
+                })
+
+        # Extract tables using pdfplumber (works with both)
+        with pdfplumber.open(file_path) as pdf:
+            for page_num, page in enumerate(pdf.pages, start=1):
+                tables_on_page = page.extract_tables()
+                if tables_on_page:
+                    for table_idx, table in enumerate(tables_on_page):
+                        if table:  # Skip empty tables
+                            raw_tables.append({
+                                "page_num": page_num,
+                                "table_idx": table_idx,
+                                "data": table
+                            })
+
+        return RawDocument(
+            filename=Path(file_path).name,
+            file_type="pdf",
+            pages=pages,
+            raw_text=raw_text.strip(),
+            raw_tables=raw_tables,
+            total_pages=len(pages)
+        )
+
+    async def _ingest_docx(self, file_path: str) -> RawDocument:
+        """
+        Extract content from DOCX file.
+
+        Args:
+            file_path: Path to DOCX file
+
+        Returns:
+            RawDocument with extracted content
+        """
+        doc = Document(file_path)
+        pages = []
+        raw_text = ""
+        raw_tables = []
+
+        # Extract text from paragraphs
+        # Note: DOCX doesn't have "pages" like PDF, so we simulate page 1
+        page_text = ""
+        for para in doc.paragraphs:
+            if para.text.strip():
+                page_text += para.text + "\n"
+                raw_text += para.text + "\n"
+
+        pages.append({
+            "page_num": 1,
+            "text": page_text
+        })
+
+        # Extract tables
+        for table_idx, table in enumerate(doc.tables):
+            table_data = []
+            for row in table.rows:
+                row_data = [cell.text.strip() for cell in row.cells]
+                table_data.append(row_data)
+
+            if table_data:  # Skip empty tables
+                raw_tables.append({
+                    "page_num": 1,
+                    "table_idx": table_idx,
+                    "data": table_data
+                })
+
+        return RawDocument(
+            filename=Path(file_path).name,
+            file_type="docx",
+            pages=pages,
+            raw_text=raw_text.strip(),
+            raw_tables=raw_tables,
+            total_pages=1  # DOCX treated as single page
+        )
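A usage sketch for the agent above: `process()` is a coroutine, so callers must await it. `sample.pdf` is a placeholder path, and the printed attributes assume `RawDocument` stores its constructor arguments as same-named fields (its full definition is truncated in this view):

```python
import asyncio

from agents.ingestion_agent import IngestionAgent

# Run the async ingestion step on its own; in the app this is orchestrated.
raw = asyncio.run(IngestionAgent().process("sample.pdf"))
print(raw.filename, raw.total_pages, len(raw.raw_tables))
```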
src/agents/ingestion_agent_alternative.py
ADDED
@@ -0,0 +1,167 @@
+"""
+Document ingestion agent with fallback support for pypdf.
+Use this version if PyMuPDF installation issues persist.
+"""
+from docx import Document
+from typing import Dict, List, Any
+from pathlib import Path
+
+from agents.base_agent import BaseAgent
+from models.document import RawDocument
+
+# Try to import fitz (PyMuPDF), fallback to pypdf
+try:
+    import fitz  # PyMuPDF
+    USING_PYMUPDF = True
+except ImportError:
+    from pypdf import PdfReader
+    USING_PYMUPDF = False
+    print("Using pypdf (PyMuPDF not available)")
+
+import pdfplumber
+
+
+class IngestionAgent(BaseAgent):
+    """Agent responsible for extracting raw content from documents."""
+
+    def __init__(self, config: Dict[str, Any] = None):
+        super().__init__(config)
+
+    def get_agent_name(self) -> str:
+        return "IngestionAgent"
+
+    async def process(self, file_path: str) -> RawDocument:
+        """
+        Process a document file and extract raw content.
+
+        Args:
+            file_path: Path to PDF or DOCX file
+
+        Returns:
+            RawDocument containing extracted content
+        """
+        file_type = self._detect_file_type(file_path)
+
+        if file_type == "pdf":
+            return await self._ingest_pdf(file_path)
+        elif file_type == "docx":
+            return await self._ingest_docx(file_path)
+        else:
+            raise ValueError(f"Unsupported file type: {file_type}")
+
+    def _detect_file_type(self, file_path: str) -> str:
+        """Detect file type from extension."""
+        extension = Path(file_path).suffix.lower()
+        if extension == ".pdf":
+            return "pdf"
+        elif extension in [".docx", ".doc"]:
+            return "docx"
+        else:
+            raise ValueError(f"Unsupported file extension: {extension}")
+
+    async def _ingest_pdf(self, file_path: str) -> RawDocument:
+        """
+        Extract content from PDF file.
+
+        Args:
+            file_path: Path to PDF file
+
+        Returns:
+            RawDocument with extracted content
+        """
+        pages = []
+        raw_text = ""
+        raw_tables = []
+
+        if USING_PYMUPDF:
+            # Extract text using PyMuPDF (fitz)
+            with fitz.open(file_path) as pdf_doc:
+                for page_num, page in enumerate(pdf_doc, start=1):
+                    page_text = page.get_text()
+                    raw_text += page_text + "\n"
+                    pages.append({
+                        "page_num": page_num,
+                        "text": page_text
+                    })
+        else:
+            # Extract text using pypdf
+            reader = PdfReader(file_path)
+            for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text()
+                raw_text += page_text + "\n"
+                pages.append({
+                    "page_num": page_num,
+                    "text": page_text
+                })
+
+        # Extract tables using pdfplumber (works with both)
+        with pdfplumber.open(file_path) as pdf:
+            for page_num, page in enumerate(pdf.pages, start=1):
+                tables_on_page = page.extract_tables()
+                if tables_on_page:
+                    for table_idx, table in enumerate(tables_on_page):
+                        if table:  # Skip empty tables
+                            raw_tables.append({
+                                "page_num": page_num,
+                                "table_idx": table_idx,
+                                "data": table
+                            })
+
+        return RawDocument(
+            filename=Path(file_path).name,
+            file_type="pdf",
+            pages=pages,
+            raw_text=raw_text.strip(),
+            raw_tables=raw_tables,
+            total_pages=len(pages)
+        )
+
+    async def _ingest_docx(self, file_path: str) -> RawDocument:
+        """
+        Extract content from DOCX file.
+
+        Args:
+            file_path: Path to DOCX file
+
+        Returns:
+            RawDocument with extracted content
+        """
+        doc = Document(file_path)
+        pages = []
+        raw_text = ""
+        raw_tables = []
+
+        # Extract text from paragraphs
+        page_text = ""
+        for para in doc.paragraphs:
+            if para.text.strip():
+                page_text += para.text + "\n"
+                raw_text += para.text + "\n"
+
+        pages.append({
+            "page_num": 1,
+            "text": page_text
+        })
+
+        # Extract tables
+        for table_idx, table in enumerate(doc.tables):
+            table_data = []
+            for row in table.rows:
+                row_data = [cell.text.strip() for cell in row.cells]
+                table_data.append(row_data)
+
+            if table_data:  # Skip empty tables
+                raw_tables.append({
+                    "page_num": 1,
+                    "table_idx": table_idx,
+                    "data": table_data
+                })
+
+        return RawDocument(
+            filename=Path(file_path).name,
+            file_type="docx",
+            pages=pages,
+            raw_text=raw_text.strip(),
+            raw_tables=raw_tables,
+            total_pages=1  # DOCX treated as single page
+        )
src/agents/table_agent.py
ADDED
@@ -0,0 +1,156 @@
+"""
+Table agent for extracting and embedding table data.
+"""
+import numpy as np
+from typing import List, Tuple, Dict, Any
+from sentence_transformers import SentenceTransformer
+
+from agents.base_agent import BaseAgent
+from models.document import TableExtraction, RawDocument
+import config
+
+
+class TableAgent(BaseAgent):
+    """Agent responsible for table extraction and embedding generation."""
+
+    def __init__(self, config_dict: Dict[str, Any] = None):
+        super().__init__(config_dict)
+        # Load embedding model (same as text agent for consistency)
+        self.model = SentenceTransformer(config.TEXT_EMBEDDING_MODEL)
+
+    def get_agent_name(self) -> str:
+        return "TableAgent"
+
+    async def process(self, raw_document: RawDocument) -> Tuple[List[TableExtraction], np.ndarray]:
+        """
+        Process raw tables into structured format and embeddings.
+
+        Args:
+            raw_document: Raw document with extracted tables
+
+        Returns:
+            Tuple of (list of TableExtraction objects, numpy array of embeddings)
+        """
+        # Parse tables
+        tables = self.parse_tables(raw_document.raw_tables)
+
+        # Generate embeddings
+        if tables:
+            table_texts = [self.linearize_table(table) for table in tables]
+            embeddings = self.generate_embeddings(table_texts)
+        else:
+            embeddings = np.array([])
+
+        return tables, embeddings
+
+    def parse_tables(self, raw_tables: List[Dict[str, Any]]) -> List[TableExtraction]:
+        """
+        Parse raw table data into structured TableExtraction objects.
+
+        Args:
+            raw_tables: List of raw table dictionaries
+
+        Returns:
+            List of TableExtraction objects
+        """
+        tables = []
+
+        for raw_table in raw_tables:
+            table_data = raw_table.get("data", [])
+            if not table_data or len(table_data) < 1:
+                continue
+
+            # First row is usually headers
+            headers = [str(cell).strip() for cell in table_data[0]] if table_data else []
+
+            # Remaining rows are data
+            rows = []
+            for row_data in table_data[1:]:
+                row = [str(cell).strip() for cell in row_data]
+                rows.append(row)
+
+            # Generate schema summary
+            schema_summary = self._generate_schema_summary(headers, rows)
+
+            table = TableExtraction(
+                headers=headers,
+                rows=rows,
+                page_number=raw_table.get("page_num", 1),
+                schema_summary=schema_summary
+            )
+            tables.append(table)
+
+        return tables
+
+    def _generate_schema_summary(self, headers: List[str], rows: List[List[str]]) -> str:
+        """
+        Generate a summary of the table schema.
+
+        Args:
+            headers: Table headers
+            rows: Table rows
+
+        Returns:
+            Schema summary string
+        """
+        num_columns = len(headers)
+        num_rows = len(rows)
+
+        summary = f"Table with {num_columns} columns and {num_rows} rows. "
+        summary += f"Columns: {', '.join(headers[:5])}"  # Show first 5 headers
+
+        if len(headers) > 5:
+            summary += f" and {len(headers) - 5} more"
+
+        return summary
+
+    def linearize_table(self, table: TableExtraction) -> str:
+        """
+        Convert table to linear text format for embedding.
+
+        Args:
+            table: TableExtraction object
+
+        Returns:
+            Linearized table as string
+        """
+        # Format: "Header1: value1, Header2: value2, ..."
+        lines = []
+
+        # Add schema summary
+        lines.append(table.schema_summary)
+
+        # Add headers
+        if table.headers:
+            lines.append(f"Headers: {' | '.join(table.headers)}")
+
+        # Add rows (sample first few for embedding)
+        max_rows = 10  # Limit to avoid very long text
+        for idx, row in enumerate(table.rows[:max_rows], start=1):
+            if row:
+                # Create row representation
+                row_text = f"Row {idx}: {' | '.join(row)}"
+                lines.append(row_text)
+
+        if len(table.rows) > max_rows:
+            lines.append(f"... and {len(table.rows) - max_rows} more rows")
+
+        return "\n".join(lines)
+
+    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for linearized tables.
+
+        Args:
+            texts: List of linearized table texts
+
+        Returns:
+            Numpy array of embeddings (shape: num_tables x embedding_dim)
+        """
+        if not texts:
+            return np.array([])
+
+        # Generate embeddings using sentence-transformers
+        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+
+        return embeddings
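The linearized text that actually gets embedded looks like the output sketched below. The `TableExtraction` constructor arguments follow the call in `parse_tables()` above; note that constructing `TableAgent` loads the sentence-transformers model:

```python
from agents.table_agent import TableAgent
from models.document import TableExtraction

agent = TableAgent()  # loads the embedding model on construction
table = TableExtraction(
    headers=["Component", "Technology", "Version"],
    rows=[["Backend", "Node.js", "18.x"]],
    page_number=1,
    schema_summary="Table with 3 columns and 1 rows. Columns: Component, Technology, Version",
)
print(agent.linearize_table(table))
# Table with 3 columns and 1 rows. Columns: Component, Technology, Version
# Headers: Component | Technology | Version
# Row 1: Backend | Node.js | 18.x
```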
src/agents/text_agent.py
ADDED
@@ -0,0 +1,137 @@
+"""
+Text agent for chunking text and generating embeddings.
+"""
+import numpy as np
+from typing import List, Tuple, Dict, Any
+from sentence_transformers import SentenceTransformer
+
+from agents.base_agent import BaseAgent
+from models.document import DocumentChunk, RawDocument
+import config
+
+
+class TextAgent(BaseAgent):
+    """Agent responsible for text chunking and embedding generation."""
+
+    def __init__(self, config_dict: Dict[str, Any] = None):
+        super().__init__(config_dict)
+        # Load embedding model
+        self.model = SentenceTransformer(config.TEXT_EMBEDDING_MODEL)
+
+    def get_agent_name(self) -> str:
+        return "TextAgent"
+
+    async def process(self, raw_document: RawDocument) -> Tuple[List[DocumentChunk], np.ndarray]:
+        """
+        Process raw document text into chunks and embeddings.
+
+        Args:
+            raw_document: Raw document with extracted text
+
+        Returns:
+            Tuple of (list of DocumentChunks, numpy array of embeddings)
+        """
+        # Chunk the text
+        chunks = self.chunk_text(raw_document.raw_text, raw_document)
+
+        # Generate embeddings
+        if chunks:
+            chunk_texts = [chunk.content for chunk in chunks]
+            embeddings = self.generate_embeddings(chunk_texts)
+        else:
+            embeddings = np.array([])
+
+        return chunks, embeddings
+
+    def chunk_text(self, text: str, raw_document: RawDocument) -> List[DocumentChunk]:
+        """
+        Split text into chunks with overlap.
+
+        Args:
+            text: Text to chunk
+            raw_document: Original document for metadata
+
+        Returns:
+            List of DocumentChunk objects
+        """
+        if not text or not text.strip():
+            return []
+
+        chunks = []
+
+        # Simple character-based chunking (approximate token-based chunking)
+        # Approximate: 1 token ~= 4 characters
+        char_chunk_size = config.TEXT_CHUNK_SIZE * 4
+        char_overlap = config.TEXT_CHUNK_OVERLAP * 4
+
+        text_length = len(text)
+        start = 0
+        chunk_idx = 0
+
+        while start < text_length:
+            end = min(start + char_chunk_size, text_length)
+
+            # Extract chunk
+            chunk_text = text[start:end].strip()
+
+            if chunk_text:
+                # Try to find the page number for this chunk
+                page_num = self._estimate_page_number(start, raw_document)
+
+                chunk = DocumentChunk(
+                    content=chunk_text,
+                    chunk_type="text",
+                    page_number=page_num,
+                    metadata={
+                        "chunk_index": chunk_idx,
+                        "start_char": start,
+                        "end_char": end
+                    }
+                )
+                chunks.append(chunk)
+                chunk_idx += 1
+
+            # Move to next chunk with overlap
+            start = end - char_overlap if end < text_length else text_length
+
+        return chunks
+
+    def _estimate_page_number(self, char_position: int, raw_document: RawDocument) -> int:
+        """
+        Estimate page number based on character position.
+
+        Args:
+            char_position: Character position in full text
+            raw_document: Original document
+
+        Returns:
+            Estimated page number (1-indexed)
+        """
+        # Calculate based on pages
+        current_pos = 0
+        for page in raw_document.pages:
+            page_text = page.get("text", "")
+            current_pos += len(page_text)
+            if char_position < current_pos:
+                return page.get("page_num", 1)
+
+        # Default to last page if not found
+        return raw_document.total_pages if raw_document.total_pages > 0 else 1
+
+    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for list of texts.
+
+        Args:
+            texts: List of text strings
+
+        Returns:
+            Numpy array of embeddings (shape: num_texts x embedding_dim)
+        """
+        if not texts:
+            return np.array([])
+
+        # Generate embeddings using sentence-transformers
+        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+
+        return embeddings
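A worked example of the chunking loop above with the defaults from `config.py` (512-token chunks, 50-token overlap, at the ~4 characters-per-token approximation the code uses):

```python
# Defaults from config.py converted to characters.
char_chunk_size = 512 * 4   # 2048 characters per chunk
char_overlap = 50 * 4       # 200 characters of overlap between chunks

# Each window advances by 2048 - 200 = 1848 characters; for a 5000-character
# text this yields chunks [0, 2048), [1848, 3896), [3696, 5000).
start, text_length = 0, 5000
while start < text_length:
    end = min(start + char_chunk_size, text_length)
    print(f"chunk: [{start}, {end})")
    start = end - char_overlap if end < text_length else text_length
```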
src/config.py
ADDED
@@ -0,0 +1,49 @@
+"""
+Central configuration for the multi-agent document comparison system.
+"""
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Paths
+PROJECT_ROOT = Path(__file__).parent
+DATA_DIR = PROJECT_ROOT / "data"
+UPLOAD_DIR = DATA_DIR / "uploads"
+VECTOR_STORE_DIR = DATA_DIR / "vector_stores"
+
+# Create directories if they don't exist
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
+
+# Embedding configuration
+TEXT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+EMBEDDING_DIMENSION = 384  # MiniLM output dimension
+
+# Chunking parameters
+TEXT_CHUNK_SIZE = 512  # tokens
+TEXT_CHUNK_OVERLAP = 50  # tokens
+
+# Similarity parameters
+TOP_K_MATCHES = 10  # Number of similar chunks to retrieve
+
+# Modality weights (Phase 1: text + tables only)
+# These weights must sum to 1.0
+MODALITY_WEIGHTS = {
+    "text": 0.60,
+    "table": 0.40
+}
+
+# File constraints
+MAX_FILE_SIZE_MB = 50
+ALLOWED_EXTENSIONS = [".pdf", ".docx"]
+
+# Future: LLM API keys (Phase 2)
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
+
+# Logging
+LOG_LEVEL = "INFO"
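Two properties of this module worth noting: importing it has side effects (it creates `data/uploads` and `data/vector_stores` under `src/`), and the Phase 2 API keys all default to empty strings, so Phase 1 runs without any `.env` file. A small sanity-check sketch, assuming it is run from `src/` so that `import config` resolves:

```python
import config  # side effect: creates the data/ directories if missing

print(config.TEXT_EMBEDDING_MODEL)   # sentence-transformers/all-MiniLM-L6-v2
print(bool(config.OPENAI_API_KEY))   # False unless set in the environment
# The per-modality weights are expected to sum to 1.0 (see comment above).
assert abs(sum(config.MODALITY_WEIGHTS.values()) - 1.0) < 1e-9
```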
src/create_test_docs.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to create sample test documents for testing the document comparison app.
|
| 3 |
+
"""
|
| 4 |
+
from docx import Document
|
| 5 |
+
from docx.shared import Inches, Pt
|
| 6 |
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def create_test_doc1():
|
| 10 |
+
"""Create first test document."""
|
| 11 |
+
doc = Document()
|
| 12 |
+
|
| 13 |
+
# Add title
|
| 14 |
+
title = doc.add_heading('Product Requirements Document', 0)
|
| 15 |
+
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 16 |
+
|
| 17 |
+
# Overview section
|
| 18 |
+
doc.add_heading('1. Overview', 1)
|
| 19 |
+
doc.add_paragraph(
|
| 20 |
+
'This document outlines the requirements for the new mobile application. '
|
| 21 |
+
'The app will provide users with real-time notifications and task management '
|
| 22 |
+
'capabilities. Our goal is to create an intuitive, user-friendly interface '
|
| 23 |
+
'that enhances productivity.'
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Features section
|
| 27 |
+
doc.add_heading('2. Features', 1)
|
| 28 |
+
doc.add_paragraph('The application will include the following key features:')
|
| 29 |
+
|
| 30 |
+
features = [
|
| 31 |
+
'User authentication with OAuth2 protocol',
|
| 32 |
+
'Push notifications for task updates and reminders',
|
| 33 |
+
'Calendar integration with Google Calendar and Outlook',
|
| 34 |
+
'Collaborative task sharing with team members',
|
| 35 |
+
'Real-time synchronization across devices'
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
for feature in features:
|
| 39 |
+
doc.add_paragraph(feature, style='List Bullet')
|
| 40 |
+
|
| 41 |
+
# Technical Specifications
|
| 42 |
+
doc.add_heading('3. Technical Specifications', 1)
|
| 43 |
+
doc.add_paragraph('The technology stack for this project:')
|
| 44 |
+
|
| 45 |
+
table = doc.add_table(rows=4, cols=3)
|
| 46 |
+
table.style = 'Medium Grid 1 Accent 1'
|
| 47 |
+
|
| 48 |
+
# Header row
|
| 49 |
+
hdr_cells = table.rows[0].cells
|
| 50 |
+
hdr_cells[0].text = 'Component'
|
| 51 |
+
hdr_cells[1].text = 'Technology'
|
| 52 |
+
hdr_cells[2].text = 'Version'
|
| 53 |
+
|
| 54 |
+
# Data rows
|
| 55 |
+
data = [
|
| 56 |
+
('Frontend', 'React Native', '0.72'),
|
| 57 |
+
('Backend', 'Node.js', '18.x'),
|
| 58 |
+
('Database', 'PostgreSQL', '15.0')
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
for i, (comp, tech, ver) in enumerate(data, start=1):
|
| 62 |
+
row = table.rows[i].cells
|
| 63 |
+
row[0].text = comp
|
| 64 |
+
row[1].text = tech
|
| 65 |
+
row[2].text = ver
|
| 66 |
+
|
| 67 |
+
# Timeline
|
| 68 |
+
doc.add_heading('4. Timeline', 1)
|
| 69 |
+
doc.add_paragraph(
|
| 70 |
+
'Phase 1: Requirements gathering - 2 weeks\n'
|
| 71 |
+
'Phase 2: Design and architecture - 3 weeks\n'
|
| 72 |
+
'Phase 3: Development - 8 weeks\n'
|
| 73 |
+
'Phase 4: Testing and QA - 2 weeks\n'
|
| 74 |
+
            'Phase 5: Deployment - 1 week'
        )

    doc.save('data/uploads/test_doc1.docx')
    print('✅ Created test_doc1.docx')


def create_test_doc2():
    """Create second test document (similar but with differences)."""
    doc = Document()

    # Add title
    title = doc.add_heading('Product Requirements Document', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Overview section (similar wording)
    doc.add_heading('1. Overview', 1)
    doc.add_paragraph(
        'This document describes the specifications for a new mobile application. '
        'The application will offer users real-time alerts and project management '
        'features. We aim to build a streamlined, easy-to-use platform that '
        'boosts team efficiency.'
    )

    # Features section (some overlap, some new)
    doc.add_heading('2. Core Features', 1)
    doc.add_paragraph('Key functionality includes:')

    features = [
        'User login with OAuth2 authentication',
        'Real-time push notifications for updates',
        'Calendar synchronization with multiple platforms',
        'Team collaboration tools and shared workspaces',
        'Offline mode support for uninterrupted work',
        'File attachment and sharing capabilities'
    ]

    for feature in features:
        doc.add_paragraph(feature, style='List Bullet')

    # Technical Specifications (different technologies)
    doc.add_heading('3. Technology Stack', 1)
    doc.add_paragraph('Proposed technology choices:')

    table = doc.add_table(rows=5, cols=3)
    table.style = 'Medium Grid 1 Accent 1'

    # Header row
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Component'
    hdr_cells[1].text = 'Technology'
    hdr_cells[2].text = 'Version'

    # Data rows (some different)
    data = [
        ('Frontend', 'React Native', '0.72'),
        ('Backend', 'Express.js', '4.18'),
        ('Database', 'MongoDB', '6.0'),
        ('Cache', 'Redis', '7.0')
    ]

    for i, (comp, tech, ver) in enumerate(data, start=1):
        row = table.rows[i].cells
        row[0].text = comp
        row[1].text = tech
        row[2].text = ver

    # Project Schedule (different from doc1)
    doc.add_heading('4. Project Schedule', 1)
    doc.add_paragraph(
        'Sprint 1: Planning and setup - 2 weeks\n'
        'Sprint 2-3: Core development - 6 weeks\n'
        'Sprint 4: Feature completion - 3 weeks\n'
        'Sprint 5: Testing phase - 3 weeks\n'
        'Sprint 6: Launch preparation - 1 week'
    )

    # Additional section (not in doc1)
    doc.add_heading('5. Budget Estimates', 1)
    doc.add_paragraph(
        'Development costs: $150,000\n'
        'Infrastructure: $20,000/year\n'
        'Maintenance: $30,000/year'
    )

    doc.save('data/uploads/test_doc2.docx')
    print('✅ Created test_doc2.docx')


def create_identical_doc():
    """Create a third document identical to doc1 for testing perfect match."""
    doc = Document()

    # Same as doc1
    title = doc.add_heading('Product Requirements Document', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_heading('1. Overview', 1)
    doc.add_paragraph(
        'This document outlines the requirements for the new mobile application. '
        'The app will provide users with real-time notifications and task management '
        'capabilities. Our goal is to create an intuitive, user-friendly interface '
        'that enhances productivity.'
    )

    doc.save('data/uploads/test_doc3_identical.docx')
    print('✅ Created test_doc3_identical.docx (identical to doc1)')


if __name__ == '__main__':
    print('Creating test documents...')
    create_test_doc1()
    create_test_doc2()
    create_identical_doc()
    print('\n✅ All test documents created successfully!')
    print('Documents saved in: data/uploads/')
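
A minimal smoke test for the script above (a sketch, not part of the commit; it assumes src/ is the working directory so the hard-coded data/uploads/ paths resolve, and that python-docx is installed):

# Hypothetical smoke test: regenerate the fixtures and confirm they exist.
from pathlib import Path

import create_test_docs  # the module shown above

Path('data/uploads').mkdir(parents=True, exist_ok=True)  # doc.save() needs the dir
create_test_docs.create_test_doc1()
create_test_docs.create_test_doc2()
create_test_docs.create_identical_doc()

for name in ('test_doc1.docx', 'test_doc2.docx', 'test_doc3_identical.docx'):
    assert (Path('data/uploads') / name).exists(), f'{name} was not created'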
src/models/__init__.py
ADDED
@@ -0,0 +1,14 @@

"""
Models package for document and chunk data structures.
"""
from models.document import RawDocument, DocumentChunk, ProcessedDocument, TableExtraction
from models.similarity import ModalityScore, SimilarityReport

__all__ = [
    "RawDocument",
    "DocumentChunk",
    "ProcessedDocument",
    "TableExtraction",
    "ModalityScore",
    "SimilarityReport",
]
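
Since the package re-exports everything through __all__, downstream code can import from the package root; a one-line sketch (assumes src/ is on sys.path, as streamlit_app.py arranges later in this commit):

from models import DocumentChunk, ProcessedDocument, SimilarityReport  # resolved via models/__init__.py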
src/models/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (532 Bytes).

src/models/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (532 Bytes).

src/models/__pycache__/document.cpython-312.pyc
ADDED
Binary file (6.33 kB).

src/models/__pycache__/document.cpython-313.pyc
ADDED
Binary file (6.28 kB).

src/models/__pycache__/similarity.cpython-312.pyc
ADDED
Binary file (3.3 kB).

src/models/__pycache__/similarity.cpython-313.pyc
ADDED
Binary file (3.35 kB).
src/models/document.py
ADDED
@@ -0,0 +1,142 @@

"""
Data models for documents and document chunks.
"""
from typing import List, Dict, Any
import uuid


class RawDocument:
    """Represents a raw document with extracted content."""

    def __init__(
        self,
        filename: str,
        file_type: str,
        pages: List[Dict[str, Any]],
        raw_text: str,
        raw_tables: List[Dict[str, Any]],
        total_pages: int,
    ):
        """
        Initialize a RawDocument.

        Args:
            filename: Name of the document file
            file_type: Type of file (e.g., 'pdf', 'docx')
            pages: List of page dictionaries with 'page_num' and 'text' keys
            raw_text: Full extracted text from the document
            raw_tables: List of tables extracted from the document
            total_pages: Total number of pages in the document
        """
        self.filename = filename
        self.file_type = file_type
        self.pages = pages
        self.raw_text = raw_text
        self.raw_tables = raw_tables
        self.total_pages = total_pages

    def __repr__(self) -> str:
        return f"RawDocument(filename={self.filename}, pages={self.total_pages})"


class DocumentChunk:
    """Represents a chunk of document content with metadata."""

    def __init__(
        self,
        content: str,
        chunk_type: str,
        page_number: int,
        metadata: Dict[str, Any] = None,
        chunk_id: str = None,
    ):
        """
        Initialize a DocumentChunk.

        Args:
            content: The text content of the chunk
            chunk_type: Type of chunk (e.g., 'text', 'table')
            page_number: Page number where this chunk appears
            metadata: Additional metadata about the chunk
            chunk_id: Unique identifier for the chunk (auto-generated if not provided)
        """
        self.content = content
        self.chunk_type = chunk_type
        self.page_number = page_number
        self.metadata = metadata or {}
        self.chunk_id = chunk_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        return (
            f"DocumentChunk(type={self.chunk_type}, page={self.page_number}, "
            f"length={len(self.content)})"
        )


class TableExtraction:
    """Represents a table extracted from a document."""

    def __init__(
        self,
        headers: List[str],
        rows: List[List[str]],
        page_number: int,
        schema_summary: str,
        table_id: str = None,
    ):
        """
        Initialize a TableExtraction.

        Args:
            headers: List of column headers
            rows: List of rows, each containing cell values
            page_number: Page number where this table appears
            schema_summary: Summary description of the table schema
            table_id: Unique identifier for the table (auto-generated if not provided)
        """
        self.headers = headers
        self.rows = rows
        self.page_number = page_number
        self.schema_summary = schema_summary
        self.table_id = table_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        return (
            f"TableExtraction(columns={len(self.headers)}, "
            f"rows={len(self.rows)}, page={self.page_number})"
        )


class ProcessedDocument:
    """Represents a fully processed document with text chunks and tables."""

    def __init__(
        self,
        filename: str,
        text_chunks: List[DocumentChunk],
        tables: List["TableExtraction"],
        total_pages: int,
        file_type: str,
    ):
        """
        Initialize a ProcessedDocument.

        Args:
            filename: Name of the document file
            text_chunks: List of text chunks extracted from the document
            tables: List of tables extracted from the document
            total_pages: Total number of pages in the document
            file_type: Type of file (e.g., 'pdf', 'docx')
        """
        self.filename = filename
        self.text_chunks = text_chunks
        self.tables = tables
        self.total_pages = total_pages
        self.file_type = file_type

    def __repr__(self) -> str:
        return (
            f"ProcessedDocument(filename={self.filename}, "
            f"text_chunks={len(self.text_chunks)}, "
            f"tables={len(self.tables)})"
        )
src/models/similarity.py
ADDED
@@ -0,0 +1,40 @@

"""
Data models for similarity scoring and comparison results.
"""
from typing import Dict, Any, List
from pydantic import BaseModel, Field
from datetime import datetime


class ModalityScore(BaseModel):
    """Represents similarity score for a specific modality (text, table, etc.)."""

    modality: str = Field(..., description="Type of modality (e.g., 'text', 'table')")
    score: float = Field(..., ge=0.0, le=1.0, description="Similarity score (0.0 to 1.0)")
    details: Dict[str, Any] = Field(default_factory=dict, description="Additional details about the scoring")
    matched_items: List[Dict[str, Any]] = Field(default_factory=list, description="List of matched items between documents")

    def __repr__(self) -> str:
        return f"ModalityScore(modality={self.modality}, score={self.score:.3f})"


class SimilarityReport(BaseModel):
    """Contains comprehensive similarity comparison results between two documents."""

    doc1_name: str = Field(..., description="Name of first document")
    doc2_name: str = Field(..., description="Name of second document")
    overall_score: float = Field(..., ge=0.0, le=1.0, description="Overall similarity score (0.0 to 1.0)")
    text_score: ModalityScore = Field(..., description="ModalityScore for text")
    table_score: ModalityScore = Field(..., description="ModalityScore for tables")
    matched_sections: List[Dict[str, Any]] = Field(default_factory=list, description="List of matched sections with details")
    weights_used: Dict[str, float] = Field(default_factory=dict, description="Weights used for modality scoring")
    timestamp: datetime = Field(default_factory=datetime.now, description="Time when report was generated")

    class Config:
        arbitrary_types_allowed = True

    def __repr__(self) -> str:
        return (
            f"SimilarityReport(docs={self.doc1_name} vs {self.doc2_name}, "
            f"score={self.overall_score:.3f})"
        )
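
A small sketch of the validation those Field constraints buy (invented values, not part of the commit):

# Hypothetical: Field(ge=0.0, le=1.0) rejects out-of-range scores at construction.
from pydantic import ValidationError
from models.similarity import ModalityScore

score = ModalityScore(modality="text", score=0.87)
print(repr(score))  # ModalityScore(modality=text, score=0.870)

try:
    ModalityScore(modality="text", score=1.2)  # violates le=1.0
except ValidationError:
    print("out-of-range score rejected at construction")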
src/orchestrator/__init__.py
ADDED
File without changes

src/orchestrator/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (185 Bytes).

src/orchestrator/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (185 Bytes).

src/orchestrator/__pycache__/scorers.cpython-312.pyc
ADDED
Binary file (6.52 kB).

src/orchestrator/__pycache__/scorers.cpython-313.pyc
ADDED
Binary file (6.37 kB).

src/orchestrator/__pycache__/similarity_orchestrator.cpython-312.pyc
ADDED
Binary file (4.64 kB).

src/orchestrator/__pycache__/similarity_orchestrator.cpython-313.pyc
ADDED
Binary file (4.55 kB).
src/orchestrator/scorers.py
ADDED
@@ -0,0 +1,197 @@

"""
Similarity scorers for different modalities.
"""
import numpy as np
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

from models.similarity import ModalityScore
from models.document import DocumentChunk, TableExtraction
import config


def compute_text_similarity(
    doc1_chunks: List[DocumentChunk],
    doc1_embeddings: np.ndarray,
    doc2_chunks: List[DocumentChunk],
    doc2_embeddings: np.ndarray
) -> ModalityScore:
    """
    Compute text similarity between two documents.

    Args:
        doc1_chunks: Chunks from document 1
        doc1_embeddings: Embeddings for document 1
        doc2_chunks: Chunks from document 2
        doc2_embeddings: Embeddings for document 2

    Returns:
        ModalityScore with text similarity details
    """
    if len(doc1_embeddings) == 0 or len(doc2_embeddings) == 0:
        return ModalityScore(
            modality="text",
            score=0.0,
            details={"reason": "One or both documents have no text"},
            matched_items=[]
        )

    # Compute pairwise cosine similarities
    similarities = cosine_similarity(doc1_embeddings, doc2_embeddings)

    # Find best matches for each chunk in doc1
    matched_items = []
    similarity_scores = []

    for i, chunk1 in enumerate(doc1_chunks):
        # Find best matching chunk in doc2
        best_match_idx = np.argmax(similarities[i])
        best_score = similarities[i][best_match_idx]

        if best_score > 0.5:  # Only include matches above threshold
            chunk2 = doc2_chunks[best_match_idx]

            matched_items.append({
                "doc1_chunk_id": chunk1.chunk_id,
                "doc2_chunk_id": chunk2.chunk_id,
                "doc1_content": chunk1.content[:200] + "..." if len(chunk1.content) > 200 else chunk1.content,
                "doc2_content": chunk2.content[:200] + "..." if len(chunk2.content) > 200 else chunk2.content,
                "similarity": float(best_score),
                "doc1_page": chunk1.page_number,
                "doc2_page": chunk2.page_number
            })

        similarity_scores.append(best_score)

    # Overall text score (mean of best matches)
    overall_score = float(np.mean(similarity_scores)) if similarity_scores else 0.0

    # Sort matched items by similarity (descending)
    matched_items.sort(key=lambda x: x["similarity"], reverse=True)

    return ModalityScore(
        modality="text",
        score=overall_score,
        details={
            "num_doc1_chunks": len(doc1_chunks),
            "num_doc2_chunks": len(doc2_chunks),
            "num_matches": len(matched_items),
            "average_similarity": overall_score
        },
        matched_items=matched_items[:config.TOP_K_MATCHES]  # Limit to top K
    )


def compute_table_similarity(
    doc1_tables: List[TableExtraction],
    doc1_embeddings: np.ndarray,
    doc2_tables: List[TableExtraction],
    doc2_embeddings: np.ndarray
) -> ModalityScore:
    """
    Compute table similarity between two documents.

    Args:
        doc1_tables: Tables from document 1
        doc1_embeddings: Embeddings for document 1 tables
        doc2_tables: Tables from document 2
        doc2_embeddings: Embeddings for document 2 tables

    Returns:
        ModalityScore with table similarity details
    """
    if len(doc1_tables) == 0 and len(doc2_tables) == 0:
        # Both documents have no tables - perfectly similar in this modality
        return ModalityScore(
            modality="table",
            score=1.0,
            details={"reason": "Neither document has tables"},
            matched_items=[]
        )

    if len(doc1_embeddings) == 0 or len(doc2_embeddings) == 0:
        # One has tables, the other doesn't
        return ModalityScore(
            modality="table",
            score=0.0,
            details={"reason": "One document has tables, the other doesn't"},
            matched_items=[]
        )

    # Compute pairwise cosine similarities
    similarities = cosine_similarity(doc1_embeddings, doc2_embeddings)

    # Find best matches
    matched_items = []
    similarity_scores = []

    for i, table1 in enumerate(doc1_tables):
        # Find best matching table in doc2
        best_match_idx = np.argmax(similarities[i])
        best_score = similarities[i][best_match_idx]

        if best_score > 0.3:  # Lower threshold for tables
            table2 = doc2_tables[best_match_idx]

            matched_items.append({
                "doc1_table_id": table1.table_id,
                "doc2_table_id": table2.table_id,
                "doc1_schema": table1.schema_summary,
                "doc2_schema": table2.schema_summary,
                "similarity": float(best_score),
                "doc1_page": table1.page_number,
                "doc2_page": table2.page_number
            })

        similarity_scores.append(best_score)

    # Overall table score
    overall_score = float(np.mean(similarity_scores)) if similarity_scores else 0.0

    # Sort matched items by similarity
    matched_items.sort(key=lambda x: x["similarity"], reverse=True)

    return ModalityScore(
        modality="table",
        score=overall_score,
        details={
            "num_doc1_tables": len(doc1_tables),
            "num_doc2_tables": len(doc2_tables),
            "num_matches": len(matched_items),
            "average_similarity": overall_score
        },
        matched_items=matched_items
    )


def compute_weighted_score(
    modality_scores: Dict[str, ModalityScore],
    weights: Dict[str, float] = None
) -> float:
    """
    Compute weighted overall similarity score.

    Args:
        modality_scores: Dictionary of modality -> ModalityScore
        weights: Dictionary of modality -> weight (defaults to config.MODALITY_WEIGHTS)

    Returns:
        Weighted overall score (0.0 to 1.0)
    """
    if weights is None:
        weights = config.MODALITY_WEIGHTS

    total_score = 0.0
    total_weight = 0.0

    for modality, score_obj in modality_scores.items():
        if modality in weights:
            weight = weights[modality]
            total_score += score_obj.score * weight
            total_weight += weight

    # Normalize by total weight
    if total_weight > 0:
        return total_score / total_weight
    else:
        return 0.0
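
To make compute_weighted_score concrete, a worked example with invented scores (the 0.7/0.3 weights here are assumptions for illustration, not necessarily the values in config.MODALITY_WEIGHTS):

# Hypothetical: text scores 0.80 at weight 0.7, tables score 0.50 at weight 0.3.
from models.similarity import ModalityScore
from orchestrator.scorers import compute_weighted_score

scores = {
    "text": ModalityScore(modality="text", score=0.80),
    "table": ModalityScore(modality="table", score=0.50),
}
# (0.80 * 0.7 + 0.50 * 0.3) / (0.7 + 0.3) = 0.71
overall = compute_weighted_score(scores, weights={"text": 0.7, "table": 0.3})
print(f"{overall:.2f}")  # 0.71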
src/orchestrator/similarity_orchestrator.py
ADDED
@@ -0,0 +1,130 @@

"""
Similarity orchestrator for coordinating document comparison across modalities.
"""
from typing import Dict, Tuple
import numpy as np

from models.document import ProcessedDocument
from models.similarity import SimilarityReport, ModalityScore
from orchestrator.scorers import (
    compute_text_similarity,
    compute_table_similarity,
    compute_weighted_score
)
import config


class SimilarityOrchestrator:
    """Orchestrates similarity comparison across multiple modalities."""

    def __init__(self, weights: Dict[str, float] = None):
        """
        Initialize orchestrator.

        Args:
            weights: Custom modality weights (defaults to config.MODALITY_WEIGHTS)
        """
        self.weights = weights or config.MODALITY_WEIGHTS

    async def compare_documents(
        self,
        doc1: ProcessedDocument,
        doc1_text_embeddings: np.ndarray,
        doc1_table_embeddings: np.ndarray,
        doc2: ProcessedDocument,
        doc2_text_embeddings: np.ndarray,
        doc2_table_embeddings: np.ndarray
    ) -> SimilarityReport:
        """
        Compare two processed documents across all modalities.

        Args:
            doc1: First processed document
            doc1_text_embeddings: Text embeddings for doc1
            doc1_table_embeddings: Table embeddings for doc1
            doc2: Second processed document
            doc2_text_embeddings: Text embeddings for doc2
            doc2_table_embeddings: Table embeddings for doc2

        Returns:
            SimilarityReport with overall score and per-modality details
        """
        # Compute text similarity
        text_score = compute_text_similarity(
            doc1.text_chunks,
            doc1_text_embeddings,
            doc2.text_chunks,
            doc2_text_embeddings
        )

        # Compute table similarity
        table_score = compute_table_similarity(
            doc1.tables,
            doc1_table_embeddings,
            doc2.tables,
            doc2_table_embeddings
        )

        # Compute weighted overall score
        modality_scores = {
            "text": text_score,
            "table": table_score
        }

        overall_score = compute_weighted_score(modality_scores, self.weights)

        # Compile matched sections from both modalities
        matched_sections = []

        # Add top text matches
        for match in text_score.matched_items[:5]:  # Top 5 text matches
            matched_sections.append({
                "type": "text",
                "doc1_content": match["doc1_content"],
                "doc2_content": match["doc2_content"],
                "similarity": match["similarity"],
                "doc1_page": match["doc1_page"],
                "doc2_page": match["doc2_page"]
            })

        # Add top table matches
        for match in table_score.matched_items[:3]:  # Top 3 table matches
            matched_sections.append({
                "type": "table",
                "doc1_schema": match["doc1_schema"],
                "doc2_schema": match["doc2_schema"],
                "similarity": match["similarity"],
                "doc1_page": match["doc1_page"],
                "doc2_page": match["doc2_page"]
            })

        # Sort all matched sections by similarity
        matched_sections.sort(key=lambda x: x["similarity"], reverse=True)

        # Create report
        report = SimilarityReport(
            doc1_name=doc1.filename,
            doc2_name=doc2.filename,
            overall_score=overall_score,
            text_score=text_score,
            table_score=table_score,
            matched_sections=matched_sections,
            weights_used=self.weights
        )

        return report

    def adjust_weights(self, new_weights: Dict[str, float]) -> None:
        """
        Adjust modality weights.

        Args:
            new_weights: New weight dictionary
        """
        # Validate weights sum to 1.0
        total = sum(new_weights.values())
        if abs(total - 1.0) > 0.01:
            # Normalize weights
            self.weights = {k: v / total for k, v in new_weights.items()}
        else:
            self.weights = new_weights
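
adjust_weights normalizes any weight set that does not already sum to 1.0; a quick sketch with invented weights:

# Hypothetical: weights summing to 3.0 get rescaled to sum to 1.0.
from orchestrator.similarity_orchestrator import SimilarityOrchestrator

orch = SimilarityOrchestrator(weights={"text": 0.7, "table": 0.3})
orch.adjust_weights({"text": 2.0, "table": 1.0})
print(orch.weights)  # {'text': 0.666..., 'table': 0.333...}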
src/requirements-alternative.txt
ADDED
@@ -0,0 +1,37 @@

# Alternative requirements.txt using pypdf instead of PyMuPDF
# Use this if PyMuPDF installation fails

# Core framework
streamlit>=1.31.0

# Data models
pydantic>=2.6.0

# Document parsing - Alternative approach
pypdf>=4.0.0  # Simpler PDF library with pure Python implementation
python-docx>=1.1.0
pdfplumber>=0.10.0

# ML & Embeddings
sentence-transformers>=2.3.0
torch>=2.2.0

# Vector storage
faiss-cpu>=1.7.0

# Data processing
numpy>=1.26.0
pandas>=2.2.0
Pillow>=10.2.0

# Utilities
python-dotenv>=1.0.0

# Visualization
plotly>=5.18.0

# Async
aiofiles>=23.2.0

# Similarity metrics
scikit-learn>=1.3.0
src/requirements.txt
ADDED
@@ -0,0 +1,35 @@

# Core framework
streamlit>=1.31.0

# Data models
pydantic>=2.6.0

# Document parsing - using versions compatible with Python 3.13
# Use pypdf if PyMuPDF has DLL issues on Windows
pypdf>=4.0.0  # Fallback PDF parser (pure Python, no DLL dependencies)
python-docx>=1.1.0
pdfplumber>=0.10.0

# ML & Embeddings
sentence-transformers>=2.3.0
torch>=2.2.0

# Vector storage
faiss-cpu>=1.7.0

# Data processing
numpy>=1.26.0
pandas>=2.2.0
Pillow>=10.2.0

# Utilities
python-dotenv>=1.0.0

# Visualization
plotly>=5.18.0

# Async
aiofiles>=23.2.0

# Similarity metrics
scikit-learn>=1.3.0
src/storage/__init__.py
ADDED
File without changes
src/storage/vector_store.py
ADDED
@@ -0,0 +1,183 @@

"""
Vector storage using FAISS for similarity search.
"""
import faiss
import numpy as np
from typing import Dict, List, Tuple, Optional, Any
from pathlib import Path
import pickle

import config


class MultiModalVectorStore:
    """Vector store for managing multi-modal embeddings using FAISS."""

    def __init__(self):
        self.indices: Dict[str, faiss.Index] = {}  # modality -> FAISS index
        self.metadata: Dict[str, List[Dict[str, Any]]] = {}  # modality -> list of metadata
        self.dimension = config.EMBEDDING_DIMENSION

    def add_vectors(
        self,
        modality: str,
        embeddings: np.ndarray,
        metadata: List[Dict[str, Any]]
    ) -> None:
        """
        Add vectors to the store for a specific modality.

        Args:
            modality: Modality type ('text' or 'table')
            embeddings: Numpy array of embeddings (num_vectors x dimension)
            metadata: List of metadata dicts for each vector
        """
        if len(embeddings) == 0:
            return

        # Ensure embeddings are float32 (required by FAISS)
        embeddings = embeddings.astype(np.float32)

        # Create index if it doesn't exist
        if modality not in self.indices:
            self.indices[modality] = faiss.IndexFlatL2(self.dimension)
            self.metadata[modality] = []

        # Add vectors to index
        self.indices[modality].add(embeddings)

        # Add metadata
        self.metadata[modality].extend(metadata)

    def query_similar(
        self,
        modality: str,
        query_vector: np.ndarray,
        k: int = 10
    ) -> List[Tuple[int, float, Dict[str, Any]]]:
        """
        Query for similar vectors.

        Args:
            modality: Modality type to search in
            query_vector: Query vector (1D array of dimension)
            k: Number of results to return

        Returns:
            List of (index, distance, metadata) tuples
        """
        if modality not in self.indices or self.indices[modality].ntotal == 0:
            return []

        # Ensure query vector is 2D and float32
        if query_vector.ndim == 1:
            query_vector = query_vector.reshape(1, -1)
        query_vector = query_vector.astype(np.float32)

        # Search
        k = min(k, self.indices[modality].ntotal)
        distances, indices = self.indices[modality].search(query_vector, k)

        # Compile results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx < len(self.metadata[modality]):
                results.append((
                    int(idx),
                    float(distance),
                    self.metadata[modality][idx]
                ))

        return results

    def get_all_vectors(self, modality: str) -> Tuple[Optional[np.ndarray], List[Dict[str, Any]]]:
        """
        Get all vectors and metadata for a modality.

        Args:
            modality: Modality type

        Returns:
            Tuple of (embeddings array, metadata list)
        """
        if modality not in self.indices or self.indices[modality].ntotal == 0:
            return None, []

        # Reconstruct vectors from index
        num_vectors = self.indices[modality].ntotal
        embeddings = faiss.rev_swig_ptr(
            self.indices[modality].get_xb(),
            num_vectors * self.dimension
        ).reshape(num_vectors, self.dimension)

        return embeddings, self.metadata[modality]

    def get_num_vectors(self, modality: str) -> int:
        """
        Get number of vectors stored for a modality.

        Args:
            modality: Modality type

        Returns:
            Number of vectors
        """
        if modality not in self.indices:
            return 0
        return self.indices[modality].ntotal

    def save(self, filename_prefix: str) -> None:
        """
        Save indices and metadata to disk.

        Args:
            filename_prefix: Prefix for saved files
        """
        save_dir = config.VECTOR_STORE_DIR
        save_dir.mkdir(parents=True, exist_ok=True)

        for modality, index in self.indices.items():
            # Save FAISS index
            index_path = save_dir / f"{filename_prefix}_{modality}.faiss"
            faiss.write_index(index, str(index_path))

            # Save metadata
            metadata_path = save_dir / f"{filename_prefix}_{modality}_metadata.pkl"
            with open(metadata_path, "wb") as f:
                pickle.dump(self.metadata[modality], f)

    def load(self, filename_prefix: str) -> bool:
        """
        Load indices and metadata from disk.

        Args:
            filename_prefix: Prefix of saved files

        Returns:
            True if loaded successfully, False otherwise
        """
        load_dir = config.VECTOR_STORE_DIR

        try:
            # Find all index files with this prefix
            for modality in ["text", "table"]:
                index_path = load_dir / f"{filename_prefix}_{modality}.faiss"
                metadata_path = load_dir / f"{filename_prefix}_{modality}_metadata.pkl"

                if index_path.exists() and metadata_path.exists():
                    # Load FAISS index
                    self.indices[modality] = faiss.read_index(str(index_path))

                    # Load metadata
                    with open(metadata_path, "rb") as f:
                        self.metadata[modality] = pickle.load(f)

            return True
        except Exception as e:
            print(f"Error loading vector store: {e}")
            return False

    def clear(self) -> None:
        """Clear all indices and metadata."""
        self.indices.clear()
        self.metadata.clear()
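
A round-trip sketch of the store (not part of the commit; assumes config.EMBEDDING_DIMENSION matches the test vectors, and the vectors themselves are random):

# Hypothetical usage: add a few vectors, then query with one of them.
import numpy as np
from storage.vector_store import MultiModalVectorStore

store = MultiModalVectorStore()
vecs = np.random.rand(4, store.dimension).astype(np.float32)
store.add_vectors("text", vecs, metadata=[{"chunk_id": i} for i in range(4)])

# Querying with the first vector should return it first, with L2 distance ~0.0.
for idx, dist, meta in store.query_similar("text", vecs[0], k=2):
    print(idx, round(dist, 4), meta)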
src/streamlit_app.py
CHANGED
@@ -1,40 +1,301 @@

Removed: the stock Streamlit demo app (welcome text pointing to https://docs.streamlit.io and https://discuss.streamlit.io, plus the example spiral chart that built x/y points with numpy and rendered them via st.altair_chart). Only `import streamlit as st` carries over into the new file.

Added:

"""
Multi-Agent Document Comparison Streamlit App
"""
import sys
from pathlib import Path

# Add project root to Python path for imports
project_root = Path(__file__).parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import streamlit as st
import asyncio
import json

# Import agents and utilities
from agents.ingestion_agent import IngestionAgent
from agents.text_agent import TextAgent
from agents.table_agent import TableAgent
from orchestrator.similarity_orchestrator import SimilarityOrchestrator
from utils.file_handler import save_uploaded_file, validate_file, get_file_type
from utils.visualization import (
    create_similarity_gauge,
    create_modality_breakdown_chart,
    format_matched_sections,
    create_score_legend
)
from models.document import ProcessedDocument
import config


# Page configuration
st.set_page_config(
    page_title="Multi-Agent Document Comparator",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)


def main():
    """Main application function."""

    # Header
    st.title("📄 Multi-Agent Document Comparator")
    st.markdown("**An agentic system to accurately match document similarity**")

    # Show architecture diagram
    with st.expander("🏗️ View System Architecture", expanded=False):
        arch_path = Path("img/multi_agent_doc_similarity_architecture.svg")
        if arch_path.exists():
            st.image(str(arch_path), use_container_width=True)
        else:
            st.info("Architecture diagram not found")

    st.markdown("---")

    # Sidebar configuration
    with st.sidebar:
        st.header("⚙️ Configuration")

        # Modality weights
        st.subheader("Modality Weights")
        text_weight = st.slider(
            "Text Weight",
            min_value=0.0,
            max_value=1.0,
            value=config.MODALITY_WEIGHTS["text"],
            step=0.05
        )
        table_weight = 1.0 - text_weight

        st.write(f"Table Weight: {table_weight:.2f}")

        # Phase info
        st.markdown("---")
        st.subheader("📋 Phase 1 Implementation")
        st.write("✅ Text comparison")
        st.write("✅ Table comparison")
        st.write("⏳ Image comparison (Phase 2)")
        st.write("⏳ Layout comparison (Phase 2)")
        st.write("⏳ Metadata comparison (Phase 2)")

    # Main content area
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("📤 Document 1 (Main)")
        uploaded_file1 = st.file_uploader(
            "Upload PDF or DOCX",
            type=["pdf", "docx"],
            key="file1",
            help="Maximum file size: 50MB"
        )

    with col2:
        st.subheader("📤 Document 2 (Comparison)")
        uploaded_file2 = st.file_uploader(
            "Upload PDF or DOCX",
            type=["pdf", "docx"],
            key="file2",
            help="Maximum file size: 50MB"
        )

    # Compare button
    st.markdown("---")

    if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
        if not uploaded_file1 or not uploaded_file2:
            st.error("Please upload both documents before comparing.")
            return

        # Process documents and compare
        with st.spinner("Processing documents..."):
            try:
                # Save uploaded files
                file1_path = save_uploaded_file(uploaded_file1)
                file2_path = save_uploaded_file(uploaded_file2)

                # Validate files
                valid1, error1 = validate_file(file1_path)
                valid2, error2 = validate_file(file2_path)

                if not valid1:
                    st.error(f"Document 1 error: {error1}")
                    return
                if not valid2:
                    st.error(f"Document 2 error: {error2}")
                    return

                # Process documents
                report = asyncio.run(process_and_compare(
                    file1_path,
                    file2_path,
                    {"text": text_weight, "table": table_weight}
                ))

                # Display results
                display_results(report)

            except Exception as e:
                st.error(f"An error occurred: {str(e)}")
                import traceback
                st.code(traceback.format_exc())


async def process_and_compare(file1_path: str, file2_path: str, weights: dict):
    """
    Process two documents and compare them.

    Args:
        file1_path: Path to first document
        file2_path: Path to second document
        weights: Modality weights

    Returns:
        SimilarityReport
    """
    # Initialize agents
    ingestion_agent = IngestionAgent()
    text_agent = TextAgent()
    table_agent = TableAgent()
    orchestrator = SimilarityOrchestrator(weights=weights)

    # Progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Ingest documents
    status_text.text("⏳ Ingesting documents...")
    progress_bar.progress(10)

    raw_doc1 = await ingestion_agent.process(file1_path)
    raw_doc2 = await ingestion_agent.process(file2_path)

    progress_bar.progress(25)

    # Step 2: Extract text
    status_text.text("⏳ Extracting and embedding text...")

    text_chunks1, text_embeddings1 = await text_agent.process(raw_doc1)
    text_chunks2, text_embeddings2 = await text_agent.process(raw_doc2)

    progress_bar.progress(50)

    # Step 3: Extract tables
    status_text.text("⏳ Extracting and embedding tables...")

    tables1, table_embeddings1 = await table_agent.process(raw_doc1)
    tables2, table_embeddings2 = await table_agent.process(raw_doc2)

    progress_bar.progress(75)

    # Step 4: Create processed documents
    processed_doc1 = ProcessedDocument(
        filename=raw_doc1.filename,
        text_chunks=text_chunks1,
        tables=tables1,
        total_pages=raw_doc1.total_pages,
        file_type=raw_doc1.file_type
    )

    processed_doc2 = ProcessedDocument(
        filename=raw_doc2.filename,
        text_chunks=text_chunks2,
        tables=tables2,
        total_pages=raw_doc2.total_pages,
        file_type=raw_doc2.file_type
    )

    # Step 5: Compare documents
    status_text.text("⏳ Comparing documents...")

    report = await orchestrator.compare_documents(
        processed_doc1,
        text_embeddings1,
        table_embeddings1,
        processed_doc2,
        text_embeddings2,
        table_embeddings2
    )

    progress_bar.progress(100)
    status_text.text("✅ Comparison complete!")

    return report


def display_results(report):
    """
    Display comparison results.

    Args:
        report: SimilarityReport object
    """
    st.markdown("---")
    st.header("📊 Comparison Results")

    # Overall similarity gauge
    col1, col2 = st.columns([1, 1])

    with col1:
        gauge_fig = create_similarity_gauge(report.overall_score)
        st.plotly_chart(gauge_fig, use_container_width=True)

    with col2:
        st.markdown(create_score_legend())

    # Modality breakdown
    st.markdown("---")
    st.subheader("📊 Per-Modality Breakdown")

    breakdown_fig = create_modality_breakdown_chart(report)
    st.plotly_chart(breakdown_fig, use_container_width=True)

    # Detailed scores
    col1, col2 = st.columns(2)

    with col1:
        if report.text_score:
            st.metric(
                "Text Similarity",
                f"{report.text_score.score:.1%}",
                f"{report.text_score.details.get('num_matches', 0)} matches"
            )

    with col2:
        if report.table_score:
            st.metric(
                "Table Similarity",
                f"{report.table_score.score:.1%}",
                f"{report.table_score.details.get('num_matches', 0)} matches"
            )

    # Matched sections
    st.markdown("---")
    st.subheader("🔍 Top Matched Sections")

    if report.matched_sections:
        formatted_sections = format_matched_sections(report.matched_sections[:5])
        st.markdown(formatted_sections)
    else:
        st.info("No significant matches found between documents.")

    # Download report
    st.markdown("---")
    report_json = json.dumps(report.model_dump(), indent=2, default=str)

    col1, col2, col3 = st.columns([1, 1, 2])

    with col1:
        st.download_button(
            label="📥 Download Report (JSON)",
            data=report_json,
            file_name=f"similarity_report_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json"
        )


if __name__ == "__main__":
    main()
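
A usage note (not part of the diff): after installing src/requirements.txt, the new app is launched with the standard `streamlit run src/streamlit_app.py`, preferably from the repository root so the relative `img/` and `data/uploads/` paths used above resolve.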
src/utils/__init__.py
ADDED
File without changes

src/utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (178 Bytes).

src/utils/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (178 Bytes).

src/utils/__pycache__/file_handler.cpython-312.pyc
ADDED
Binary file (4.59 kB).

src/utils/__pycache__/visualization.cpython-312.pyc
ADDED
Binary file (8.29 kB).

src/utils/__pycache__/visualization.cpython-313.pyc
ADDED
Binary file (8.23 kB).