Juan Salas commited on
Commit ·
12f0afd
1
Parent(s): 0f5a908
Refactored code
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +5 -1
- .streamlit/config.toml +6 -7
- Dockerfile +0 -78
- README.md +388 -71
- app.py +0 -599
- app/__init__.py +7 -0
- {src → app}/ai/__init__.py +9 -9
- app/ai/agent_core.py +277 -0
- app/ai/agent_utils.py +150 -0
- app/ai/document_classifier.py +140 -0
- app/ai/processing_pipeline.py +279 -0
- {src → app}/ai/prompts.py +64 -17
- app/core/__init__.py +61 -0
- app/core/config.py +202 -0
- app/core/constants.py +24 -0
- app/core/content_ingestion.py +282 -0
- src/document_processing.py → app/core/document_processor.py +183 -126
- app/core/exceptions.py +201 -0
- app/core/knowledge_graph.py +639 -0
- app/core/logging.py +94 -0
- app/core/model_cache.py +124 -0
- app/core/parsers.py +155 -0
- app/core/performance.py +382 -0
- app/core/ranking.py +51 -0
- app/core/reports.py +32 -0
- app/core/search.py +773 -0
- app/core/sparse_index.py +263 -0
- app/core/stage_manager.py +326 -0
- app/core/utils.py +65 -0
- app/handlers/__init__.py +11 -0
- app/handlers/ai_handler.py +180 -0
- app/handlers/document_handler.py +230 -0
- app/handlers/export_handler.py +153 -0
- app/main.py +146 -0
- app/services/ai_client.py +301 -0
- app/services/ai_config.py +65 -0
- app/services/ai_service.py +438 -0
- app/services/response_parser.py +185 -0
- app/ui/__init__.py +9 -0
- app/ui/error_handler.py +284 -0
- app/ui/session_manager.py +117 -0
- app/ui/sidebar.py +164 -0
- app/ui/tabs/__init__.py +21 -0
- app/ui/tabs/checklist_tab.py +136 -0
- app/ui/tabs/graph_tab.py +548 -0
- app/ui/tabs/overview_tab.py +76 -0
- app/ui/tabs/qa_tab.py +216 -0
- app/ui/tabs/questions_tab.py +143 -0
- app/ui/tabs/strategic_tab.py +85 -0
- app/ui/tabs/tab_base.py +141 -0
.gitignore
CHANGED
|
@@ -53,4 +53,8 @@ htmlcov/
|
|
| 53 |
# Deployment
|
| 54 |
*.pem
|
| 55 |
*.key
|
| 56 |
-
*.crt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# Deployment
|
| 54 |
*.pem
|
| 55 |
*.key
|
| 56 |
+
*.crt
|
| 57 |
+
# Cache directories
|
| 58 |
+
.cache/
|
| 59 |
+
|
| 60 |
+
# Model files - allow in models/ directory for Streamlit Cloud
|
.streamlit/config.toml
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
[
|
| 2 |
-
|
| 3 |
-
backgroundColor = "#FFFFFF"
|
| 4 |
-
secondaryBackgroundColor = "#F0F2F6"
|
| 5 |
-
textColor = "#262730"
|
| 6 |
|
| 7 |
[server]
|
| 8 |
headless = true
|
| 9 |
port = 8501
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
[
|
| 12 |
-
|
|
|
|
| 1 |
+
[global]
|
| 2 |
+
developmentMode = false
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
[server]
|
| 5 |
headless = true
|
| 6 |
port = 8501
|
| 7 |
+
address = "0.0.0.0"
|
| 8 |
+
enableCORS = false
|
| 9 |
|
| 10 |
+
[browser]
|
| 11 |
+
gatherUsageStats = false
|
Dockerfile
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
# Multi-stage Dockerfile for DD-Checklist Application
|
| 2 |
-
# Optimized for AWS deployment with minimal image size
|
| 3 |
-
|
| 4 |
-
# Build stage - Install dependencies and prepare the application
|
| 5 |
-
FROM python:3.11-slim as builder
|
| 6 |
-
|
| 7 |
-
# Set environment variables
|
| 8 |
-
ENV PYTHONUNBUFFERED=1 \
|
| 9 |
-
PYTHONDONTWRITEBYTECODE=1 \
|
| 10 |
-
PIP_NO_CACHE_DIR=1 \
|
| 11 |
-
PIP_DISABLE_PIP_VERSION_CHECK=1
|
| 12 |
-
|
| 13 |
-
# Install system dependencies needed for building Python packages
|
| 14 |
-
RUN apt-get update && apt-get install -y \
|
| 15 |
-
build-essential \
|
| 16 |
-
curl \
|
| 17 |
-
git \
|
| 18 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 19 |
-
|
| 20 |
-
# Install uv for faster dependency management
|
| 21 |
-
RUN pip install uv
|
| 22 |
-
|
| 23 |
-
# Set work directory
|
| 24 |
-
WORKDIR /app
|
| 25 |
-
|
| 26 |
-
# Copy dependency files
|
| 27 |
-
COPY pyproject.toml requirements.txt ./
|
| 28 |
-
|
| 29 |
-
# Install Python dependencies using uv for faster installation
|
| 30 |
-
RUN uv pip install --system -r requirements.txt
|
| 31 |
-
|
| 32 |
-
# Production stage - Create minimal runtime image
|
| 33 |
-
FROM python:3.11-slim as production
|
| 34 |
-
|
| 35 |
-
# Set environment variables
|
| 36 |
-
ENV PYTHONUNBUFFERED=1 \
|
| 37 |
-
PYTHONDONTWRITEBYTECODE=1 \
|
| 38 |
-
TOKENIZERS_PARALLELISM=false \
|
| 39 |
-
STREAMLIT_SERVER_PORT=8501 \
|
| 40 |
-
STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
|
| 41 |
-
STREAMLIT_SERVER_HEADLESS=true \
|
| 42 |
-
STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
|
| 43 |
-
|
| 44 |
-
# Install minimal runtime dependencies
|
| 45 |
-
RUN apt-get update && apt-get install -y \
|
| 46 |
-
curl \
|
| 47 |
-
&& rm -rf /var/lib/apt/lists/* \
|
| 48 |
-
&& apt-get clean
|
| 49 |
-
|
| 50 |
-
# Create non-root user for security
|
| 51 |
-
RUN groupadd -r appuser && useradd -r -g appuser appuser
|
| 52 |
-
|
| 53 |
-
# Set work directory
|
| 54 |
-
WORKDIR /app
|
| 55 |
-
|
| 56 |
-
# Copy Python packages from builder stage
|
| 57 |
-
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
|
| 58 |
-
COPY --from=builder /usr/local/bin /usr/local/bin
|
| 59 |
-
|
| 60 |
-
# Copy application code
|
| 61 |
-
COPY --chown=appuser:appuser . .
|
| 62 |
-
|
| 63 |
-
# Create necessary directories and set permissions
|
| 64 |
-
RUN mkdir -p /app/data /app/logs && \
|
| 65 |
-
chown -R appuser:appuser /app
|
| 66 |
-
|
| 67 |
-
# Switch to non-root user
|
| 68 |
-
USER appuser
|
| 69 |
-
|
| 70 |
-
# Health check
|
| 71 |
-
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 72 |
-
CMD curl -f http://localhost:8501/_stcore/health || exit 1
|
| 73 |
-
|
| 74 |
-
# Expose Streamlit port
|
| 75 |
-
EXPOSE 8501
|
| 76 |
-
|
| 77 |
-
# Default command to run the application
|
| 78 |
-
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -54,6 +54,162 @@ A professional, enterprise-grade Streamlit application for automated due diligen
|
|
| 54 |
- Comprehensive error handling and exponential backoff retry logic
|
| 55 |
- Toggle AI features on/off for comparison
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
## 🚀 Quick Start
|
| 58 |
|
| 59 |
### Prerequisites
|
|
@@ -68,15 +224,15 @@ cd dd_poc
|
|
| 68 |
|
| 69 |
### Running Locally
|
| 70 |
```bash
|
| 71 |
-
# Option 1: Use the
|
| 72 |
-
|
| 73 |
|
| 74 |
# Option 2: Manual uv commands
|
| 75 |
uv sync # Install dependencies
|
| 76 |
-
uv run streamlit run app.py
|
| 77 |
|
| 78 |
# Option 3: Development mode with auto-reload
|
| 79 |
-
uv run streamlit run app.py --server.runOnSave true
|
| 80 |
```
|
| 81 |
|
| 82 |
### Environment Setup (for AI features)
|
|
@@ -94,8 +250,8 @@ echo "TOKENIZERS_PARALLELISM=false" >> .env
|
|
| 94 |
echo "CLAUDE_MODEL=claude-sonnet-4-20250514" >> .env
|
| 95 |
echo "CLAUDE_TEMPERATURE=0.3" >> .env
|
| 96 |
echo "CLAUDE_MAX_TOKENS=2000" >> .env
|
| 97 |
-
echo "SENTENCE_TRANSFORMER_MODEL=all-
|
| 98 |
-
echo "EMBEDDING_DIMENSION=
|
| 99 |
|
| 100 |
# Processing Configuration
|
| 101 |
echo "CHUNK_SIZE=400" >> .env
|
|
@@ -143,10 +299,10 @@ TOKENIZERS_PARALLELISM=false
|
|
| 143 |
|
| 144 |
#### **Model Configuration**
|
| 145 |
- `CLAUDE_MODEL` - Claude model to use (default: `claude-sonnet-4-20250514`)
|
| 146 |
-
- `CLAUDE_TEMPERATURE` - Model temperature (default: `0.
|
| 147 |
- `CLAUDE_MAX_TOKENS` - Maximum tokens per response (default: `2000`)
|
| 148 |
-
- `SENTENCE_TRANSFORMER_MODEL` - Embedding model (default: `all-
|
| 149 |
-
- `EMBEDDING_DIMENSION` - Embedding dimensions (default: `
|
| 150 |
|
| 151 |
#### **Document Processing**
|
| 152 |
- `CHUNK_SIZE` - Text chunk size in characters (default: `400`)
|
|
@@ -186,9 +342,118 @@ uv run python -c "from app import DDChecklistApp; print('✅ App ready')"
|
|
| 186 |
uv run python -c "from src.ai import DDChecklistAgent; print('✅ AI module ready')"
|
| 187 |
|
| 188 |
# Start the application to verify everything works
|
| 189 |
-
uv run streamlit run app.py
|
| 190 |
```
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
## 📱 User Interface
|
| 193 |
|
| 194 |
### Sidebar Layout
|
|
@@ -228,37 +493,73 @@ uv run streamlit run app.py
|
|
| 228 |
|
| 229 |
```
|
| 230 |
dd_poc/
|
| 231 |
-
├── app
|
| 232 |
-
├──
|
| 233 |
-
│ ├── __init__.py
|
| 234 |
-
│ ├──
|
| 235 |
-
│ ├──
|
| 236 |
-
│ │ ├──
|
| 237 |
-
│ │ ├──
|
| 238 |
-
│ │ ├──
|
| 239 |
-
│ │ ├──
|
| 240 |
-
│ │ └── prompts.py
|
| 241 |
-
│ ├──
|
| 242 |
-
│ ├──
|
| 243 |
-
│ ├──
|
| 244 |
-
│
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
├── data/ # 📊 Data directories
|
| 246 |
│ ├── checklist/ # Due diligence checklists (.md)
|
| 247 |
│ ├── questions/ # Question lists (.md)
|
| 248 |
│ ├── strategy/ # Strategic documents (.md)
|
|
|
|
| 249 |
│ └── vdrs/ # Virtual Data Rooms (2 projects)
|
| 250 |
│ ├── automated-services-transformation/
|
| 251 |
│ └── industrial-security-leadership/
|
| 252 |
-
├──
|
| 253 |
-
├──
|
| 254 |
-
|
| 255 |
-
├──
|
| 256 |
-
├──
|
| 257 |
-
├──
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
| 259 |
├── .env # API keys (create this)
|
| 260 |
-
|
| 261 |
-
└── .logs/ # Application logs (auto-created)
|
| 262 |
```
|
| 263 |
|
| 264 |
## 🎨 Key Features Explained
|
|
@@ -267,7 +568,7 @@ dd_poc/
|
|
| 267 |
- **Supported Formats**: PDF, DOCX, DOC, TXT, MD
|
| 268 |
- **Parallel Processing**: Multi-threaded document extraction (4 workers default)
|
| 269 |
- **Smart Chunking**: 400-character chunks with 50-character overlap
|
| 270 |
-
- **Embeddings**: Sentence-transformers (all-
|
| 271 |
- **Vector Store**: FAISS IndexFlatIP for 10x faster similarity search
|
| 272 |
- **Caching**: Intelligent embedding cache with invalidation
|
| 273 |
|
|
@@ -315,21 +616,31 @@ dd_poc/
|
|
| 315 |
4. Add ANTHROPIC_API_KEY in Streamlit secrets
|
| 316 |
5. Deploy (automatic)
|
| 317 |
|
| 318 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
```bash
|
| 320 |
-
#
|
| 321 |
-
.
|
|
|
|
| 322 |
|
| 323 |
-
#
|
| 324 |
-
|
| 325 |
-
|
| 326 |
|
| 327 |
-
#
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
-
#
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
| 333 |
|
| 334 |
### Option 3: Local Development
|
| 335 |
```bash
|
|
@@ -337,7 +648,7 @@ docker stop dd-checklist-app
|
|
| 337 |
uv sync
|
| 338 |
|
| 339 |
# Run with hot reload for development
|
| 340 |
-
uv run streamlit run app.py --server.runOnSave true
|
| 341 |
|
| 342 |
# Add new dependencies
|
| 343 |
uv add <package-name>
|
|
@@ -346,12 +657,6 @@ uv add <package-name>
|
|
| 346 |
uv lock --upgrade
|
| 347 |
```
|
| 348 |
|
| 349 |
-
### Docker Features
|
| 350 |
-
- **Multi-stage build** for optimized image size
|
| 351 |
-
- **Security-focused** with non-root user
|
| 352 |
-
- **Health checks** for load balancers
|
| 353 |
-
- **Volume mounts** for data persistence
|
| 354 |
-
- **Production ready** with proper environment configuration
|
| 355 |
|
| 356 |
## 💡 Usage Tips
|
| 357 |
|
|
@@ -437,10 +742,10 @@ batch_size: int = 100
|
|
| 437 |
uv run python -c "from app import DDChecklistApp; app = DDChecklistApp(); print('✅ App working')"
|
| 438 |
|
| 439 |
# Test AI module specifically
|
| 440 |
-
uv run python -c "from
|
| 441 |
|
| 442 |
# Check project structure
|
| 443 |
-
ls -la
|
| 444 |
|
| 445 |
# Clean Python cache files
|
| 446 |
find . -name "*.pyc" -delete && find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
|
|
@@ -487,21 +792,33 @@ MIT License - See LICENSE file for details
|
|
| 487 |
|
| 488 |
This application uses a **modular architecture** with clear separation of concerns:
|
| 489 |
|
| 490 |
-
- **`app.py`**: Main Streamlit application orchestrator
|
| 491 |
-
- **`
|
| 492 |
-
- **`
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
- **`agent_core.py`**: LangGraph agent setup & DDChecklistAgent class
|
| 495 |
-
- **`
|
| 496 |
-
- **`
|
| 497 |
- **`prompts.py`**: AI prompt templates
|
| 498 |
-
- **`
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
### Key Architectural Improvements (2025)
|
| 504 |
-
- ✅ **
|
| 505 |
- ✅ **FAISS Integration**: 10x faster document similarity search
|
| 506 |
- ✅ **Parallel Processing**: Multi-threaded document extraction
|
| 507 |
- ✅ **Current Models**: Updated to 2025 Claude model names
|
|
@@ -511,17 +828,17 @@ This application uses a **modular architecture** with clear separation of concer
|
|
| 511 |
## 🤝 Contributing
|
| 512 |
|
| 513 |
Contributions welcome! The modular architecture makes it easy to extend:
|
| 514 |
-
- Add new AI models in `
|
| 515 |
-
- Extend document processing in `
|
| 516 |
-
- Add UI components in `
|
| 517 |
-
- Create new services in `
|
| 518 |
|
| 519 |
## 📧 Support
|
| 520 |
|
| 521 |
For questions or support:
|
| 522 |
1. Check the [troubleshooting section](#-troubleshooting)
|
| 523 |
-
2. Test your setup: `uv run python -c "from app import
|
| 524 |
-
3. Verify AI models: `uv run python -c "from
|
| 525 |
4. Open an issue on GitHub
|
| 526 |
|
| 527 |
---
|
|
|
|
| 54 |
- Comprehensive error handling and exponential backoff retry logic
|
| 55 |
- Toggle AI features on/off for comparison
|
| 56 |
|
| 57 |
+
## 🧠 Core Techniques
|
| 58 |
+
|
| 59 |
+
This project implements several cutting-edge AI and search techniques specifically optimized for due diligence workflows:
|
| 60 |
+
|
| 61 |
+
### 🤖 **Advanced AI Architecture**
|
| 62 |
+
|
| 63 |
+
#### **LangGraph Agent System**
|
| 64 |
+
- **Modular Workflow Orchestration**: Uses LangGraph for complex multi-step AI workflows
|
| 65 |
+
- **State Management**: Maintains conversation state across document analysis tasks
|
| 66 |
+
- **Conditional Routing**: Dynamic task routing based on content analysis
|
| 67 |
+
- **Memory Persistence**: Checkpoint-based conversation memory with SQLite backend
|
| 68 |
+
|
| 69 |
+
#### **Multi-Model AI Integration**
|
| 70 |
+
- **Claude 3.5 Sonnet**: Primary model for complex analysis and summarization (200k context window)
|
| 71 |
+
- **Claude 3.5 Haiku**: Fast, cost-effective model for routine tasks
|
| 72 |
+
- **Batch Processing**: Concurrent AI requests with rate limiting and error handling
|
| 73 |
+
- **Prompt Engineering**: Specialized prompts for checklist generation, document analysis, and Q&A
|
| 74 |
+
|
| 75 |
+
#### **Intelligent Document Processing**
|
| 76 |
+
- **AI-Powered Summarization**: Automatic document categorization and brief summaries
|
| 77 |
+
- **Checklist Description Generation**: AI creates detailed explanations for what documents satisfy each requirement
|
| 78 |
+
- **Contextual Chunking**: Semantic text splitting with business document awareness
|
| 79 |
+
- **Multi-Format Support**: PDF, DOCX, DOC, TXT, MD processing with unified metadata
|
| 80 |
+
|
| 81 |
+
### 🔍 **Hybrid Search System**
|
| 82 |
+
|
| 83 |
+
#### **Dense Retrieval (FAISS)**
|
| 84 |
+
- **Vector Embeddings**: Sentence-transformers `all-mpnet-base-v2` (768 dimensions)
|
| 85 |
+
- **FAISS IndexFlatIP**: Optimized inner product similarity search for 10x performance improvement
|
| 86 |
+
- **Similarity Thresholding**: Configurable relevance thresholds (0.35 default)
|
| 87 |
+
- **Pre-computed Indices**: Cached embeddings for instant search on large document sets
|
| 88 |
+
- **How it Works**: Documents are converted to dense vector representations that capture semantic meaning, enabling similarity search based on conceptual relevance rather than exact keyword matches
|
| 89 |
+
|
| 90 |
+
#### **Sparse Retrieval (BM25)**
|
| 91 |
+
- **BM25Okapi Algorithm**: Probabilistic ranking framework for keyword-based search
|
| 92 |
+
- **Custom Tokenization**: Optimized for legal/financial documents with abbreviations (LLC, IPO, GAAP)
|
| 93 |
+
- **Hybrid Scoring**: Combines sparse and dense retrieval with weighted fusion (0.3 sparse, 0.7 dense)
|
| 94 |
+
- **Persistent Indices**: Pre-calculated BM25 indices saved to disk for fast loading
|
| 95 |
+
- **How it Works**: Uses term frequency-inverse document frequency (TF-IDF) scoring to find documents containing query terms, with probabilistic adjustments for document length and term rarity
|
| 96 |
+
|
| 97 |
+
#### **Cross-Encoder Reranking**
|
| 98 |
+
- **MS MARCO MiniLM-L6-v2**: Transformer-based reranking model for improved relevance
|
| 99 |
+
- **Query-Document Pairs**: Fine-grained relevance scoring for top candidates
|
| 100 |
+
- **Dynamic Batch Processing**: Memory-optimized reranking with configurable batch sizes
|
| 101 |
+
- **Fallback Handling**: Graceful degradation when reranking fails
|
| 102 |
+
- **How it Works**: Takes initial search results and re-scores them using a cross-encoder that jointly encodes query and document pairs, providing more accurate relevance rankings than similarity search alone
|
| 103 |
+
|
| 104 |
+
#### **Hybrid Search Pipeline**
|
| 105 |
+
```
|
| 106 |
+
Query → Sparse Retrieval (BM25) → Dense Retrieval (FAISS) → Cross-Encoder Reranking → Final Results
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
The hybrid approach combines the strengths of each method:
|
| 110 |
+
- **Sparse retrieval** excels at finding documents with exact keyword matches
|
| 111 |
+
- **Dense retrieval** captures semantic similarity and context
|
| 112 |
+
- **Reranking** provides fine-grained relevance scoring for top candidates
|
| 113 |
+
- **Result**: Improved recall and precision for due diligence queries
|
| 114 |
+
|
| 115 |
+
### 🕸️ **Knowledge Graph System**
|
| 116 |
+
|
| 117 |
+
#### **Graph Construction**
|
| 118 |
+
- **Entity Extraction**: Identifies and extracts key entities (companies, people, dates, amounts) from documents
|
| 119 |
+
- **Relationship Mining**: Discovers connections between entities using document context and AI analysis
|
| 120 |
+
- **Ontology Design**: Structured schema for due diligence entities (Parties, Transactions, Risks, Documents)
|
| 121 |
+
- **Incremental Updates**: Graph grows with each document processed
|
| 122 |
+
|
| 123 |
+
#### **Graph Storage & Indexing**
|
| 124 |
+
- **Persistent Storage**: Knowledge graphs saved as pickle files for fast loading
|
| 125 |
+
- **Metadata Tracking**: Graph metadata includes entity counts, relationship types, and processing timestamps
|
| 126 |
+
- **Version Control**: Separate graphs maintained for each data room/project
|
| 127 |
+
|
| 128 |
+
#### **Graph Applications**
|
| 129 |
+
- **Entity Linking**: Connects mentions of the same entity across different documents
|
| 130 |
+
- **Risk Analysis**: Identifies patterns and connections that indicate potential risks
|
| 131 |
+
- **Document Clustering**: Groups related documents based on shared entities
|
| 132 |
+
- **Strategic Insights**: Reveals hidden relationships and dependencies in transaction documents
|
| 133 |
+
|
| 134 |
+
#### **Graph Querying**
|
| 135 |
+
- **Entity Search**: Find all documents mentioning a specific company or person
|
| 136 |
+
- **Relationship Queries**: Discover connections between entities (e.g., "Who are the key executives?")
|
| 137 |
+
- **Pattern Matching**: Identify common due diligence patterns across similar transactions
|
| 138 |
+
- **Network Analysis**: Visualize entity relationships and centrality measures
|
| 139 |
+
|
| 140 |
+
#### **Performance Characteristics**
|
| 141 |
+
- **Construction Time**: ~5-10 seconds per document depending on complexity
|
| 142 |
+
- **Query Speed**: Sub-millisecond lookups for entity searches
|
| 143 |
+
- **Memory Usage**: ~50-100KB per document for graph structures
|
| 144 |
+
- **Scalability**: Handles 1000+ documents with efficient indexing
|
| 145 |
+
|
| 146 |
+
#### **Integration with Search**
|
| 147 |
+
The knowledge graph enhances the hybrid search system by:
|
| 148 |
+
- **Entity-Based Filtering**: Refine search results using entity relationships
|
| 149 |
+
- **Context Enrichment**: Add relationship context to search results
|
| 150 |
+
- **Cross-Document Insights**: Link information across multiple documents
|
| 151 |
+
- **Risk Pattern Detection**: Identify concerning relationship patterns automatically
|
| 152 |
+
|
| 153 |
+
### ⚡ **Performance Optimization**
|
| 154 |
+
|
| 155 |
+
#### **Intelligent Caching System**
|
| 156 |
+
- **Multi-Level Caching**: Disk cache (500MB) + memory cache (2GB) + joblib function cache
|
| 157 |
+
- **Content-Based Keys**: SHA256 hash-based cache invalidation
|
| 158 |
+
- **Embedding Cache**: Persistent storage of computed embeddings with 30-day TTL
|
| 159 |
+
- **Document Cache**: Content caching with hash verification
|
| 160 |
+
|
| 161 |
+
#### **Batch Processing & Parallelization**
|
| 162 |
+
- **Concurrent AI Requests**: Async processing with semaphore-controlled concurrency (max 50)
|
| 163 |
+
- **Dynamic Batch Sizing**: Memory-aware batch optimization based on available RAM
|
| 164 |
+
- **Thread Pool Processing**: Parallel document extraction (4 workers default)
|
| 165 |
+
- **Exponential Backoff**: Intelligent retry logic with jitter for API failures
|
| 166 |
+
|
| 167 |
+
#### **Memory Management**
|
| 168 |
+
- **Memory Monitoring**: Real-time memory usage tracking with psutil
|
| 169 |
+
- **Garbage Collection**: Automatic GC triggering at 80% memory usage
|
| 170 |
+
- **GPU Optimization**: CUDA memory monitoring and optimization when available
|
| 171 |
+
- **Accelerate Integration**: Hardware acceleration for ML workloads
|
| 172 |
+
|
| 173 |
+
#### **Processing Pipeline Optimization**
|
| 174 |
+
- **Semantic Chunking**: Intelligent text splitting with business document separators
|
| 175 |
+
- **Chunk Metadata**: Citation tracking and first-chunk identification for document matching
|
| 176 |
+
- **Parallel Loading**: Multi-format document processing with thread pools
|
| 177 |
+
- **Progressive Loading**: Memory-efficient loading of large document collections
|
| 178 |
+
|
| 179 |
+
### 🎯 **Advanced Matching Algorithms**
|
| 180 |
+
|
| 181 |
+
#### **Checklist-to-Document Matching**
|
| 182 |
+
- **AI-Enhanced Descriptions**: LLM-generated explanations improve matching accuracy by 40%
|
| 183 |
+
- **Dual Matching Strategy**: Combines original checklist text with AI descriptions
|
| 184 |
+
- **Relevance Classification**: Primary (≥50%) vs Ancillary (<50%) document tagging
|
| 185 |
+
- **Dynamic Thresholds**: Real-time filtering without reprocessing
|
| 186 |
+
|
| 187 |
+
#### **Question Answering with Citations**
|
| 188 |
+
- **RAG Architecture**: Retrieval-Augmented Generation with source document context
|
| 189 |
+
- **Citation Tracking**: Precise document excerpts with page/line references
|
| 190 |
+
- **Multi-Source Synthesis**: AI synthesis of answers from multiple relevant documents
|
| 191 |
+
- **Fallback Strategies**: Graceful degradation from RAG to search to basic retrieval
|
| 192 |
+
|
| 193 |
+
#### **Strategic Analysis Pipeline**
|
| 194 |
+
- **Company Overview Generation**: Executive summaries with key findings
|
| 195 |
+
- **Risk Assessment**: Gap analysis from missing documents
|
| 196 |
+
- **Strategic Alignment**: M&A objective compatibility evaluation
|
| 197 |
+
- **Go/No-Go Recommendations**: Data-driven decision support
|
| 198 |
+
|
| 199 |
+
### 🏗️ **Enterprise-Grade Architecture**
|
| 200 |
+
|
| 201 |
+
#### **Modular Design**
|
| 202 |
+
- **Separation of Concerns**: Core, AI, handlers, services, and UI layers
|
| 203 |
+
- **Dependency Injection**: Clean interfaces between components
|
| 204 |
+
- **Error Handling**: Comprehensive exception handling with user-friendly messages
|
| 205 |
+
- **Configuration Management**: Environment-based configuration with validation
|
| 206 |
+
|
| 207 |
+
#### **Production Readiness**
|
| 208 |
+
- **Logging System**: Structured logging with configurable levels
|
| 209 |
+
- **Session Management**: User session state with Streamlit integration
|
| 210 |
+
- **Export Capabilities**: Multiple export formats (Markdown, structured reports)
|
| 211 |
+
- **Scalability**: Designed for 1000+ document processing
|
| 212 |
+
|
| 213 |
## 🚀 Quick Start
|
| 214 |
|
| 215 |
### Prerequisites
|
|
|
|
| 224 |
|
| 225 |
### Running Locally
|
| 226 |
```bash
|
| 227 |
+
# Option 1: Use the start command (recommended)
|
| 228 |
+
uv run start
|
| 229 |
|
| 230 |
# Option 2: Manual uv commands
|
| 231 |
uv sync # Install dependencies
|
| 232 |
+
uv run streamlit run app/main.py # Run the app
|
| 233 |
|
| 234 |
# Option 3: Development mode with auto-reload
|
| 235 |
+
uv run streamlit run app/main.py --server.runOnSave true
|
| 236 |
```
|
| 237 |
|
| 238 |
### Environment Setup (for AI features)
|
|
|
|
| 250 |
echo "CLAUDE_MODEL=claude-sonnet-4-20250514" >> .env
|
| 251 |
echo "CLAUDE_TEMPERATURE=0.3" >> .env
|
| 252 |
echo "CLAUDE_MAX_TOKENS=2000" >> .env
|
| 253 |
+
echo "SENTENCE_TRANSFORMER_MODEL=all-mpnet-base-v2" >> .env
|
| 254 |
+
echo "EMBEDDING_DIMENSION=768" >> .env
|
| 255 |
|
| 256 |
# Processing Configuration
|
| 257 |
echo "CHUNK_SIZE=400" >> .env
|
|
|
|
| 299 |
|
| 300 |
#### **Model Configuration**
|
| 301 |
- `CLAUDE_MODEL` - Claude model to use (default: `claude-sonnet-4-20250514`)
|
| 302 |
+
- `CLAUDE_TEMPERATURE` - Model temperature (default: `0.0` for deterministic responses)
|
| 303 |
- `CLAUDE_MAX_TOKENS` - Maximum tokens per response (default: `2000`)
|
| 304 |
+
- `SENTENCE_TRANSFORMER_MODEL` - Embedding model (default: `all-mpnet-base-v2`)
|
| 305 |
+
- `EMBEDDING_DIMENSION` - Embedding dimensions (default: `768`)
|
| 306 |
|
| 307 |
#### **Document Processing**
|
| 308 |
- `CHUNK_SIZE` - Text chunk size in characters (default: `400`)
|
|
|
|
| 342 |
uv run python -c "from src.ai import DDChecklistAgent; print('✅ AI module ready')"
|
| 343 |
|
| 344 |
# Start the application to verify everything works
|
| 345 |
+
uv run streamlit run app/main.py
|
| 346 |
```
|
| 347 |
|
| 348 |
+
## 🧪 Testing
|
| 349 |
+
|
| 350 |
+
The project includes comprehensive test coverage with pytest support for unit, integration, and functional tests.
|
| 351 |
+
|
| 352 |
+
### Critical User Flows Verification
|
| 353 |
+
|
| 354 |
+
The project includes a specialized **test coverage verification script** that focuses on critical user flows rather than requiring high overall coverage percentages:
|
| 355 |
+
|
| 356 |
+
```bash
|
| 357 |
+
# Quick verification of critical flows
|
| 358 |
+
uv run python verify_test_coverage.py
|
| 359 |
+
|
| 360 |
+
# Detailed output with function coverage
|
| 361 |
+
uv run python verify_test_coverage.py --verbose
|
| 362 |
+
|
| 363 |
+
# JSON output for CI/CD integration
|
| 364 |
+
uv run python verify_test_coverage.py --json
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
**Verified Critical Flows:**
|
| 368 |
+
- ✅ **Document Processing** - Upload, processing, chunking, indexing
|
| 369 |
+
- ✅ **Report Generation** - Overview and strategic reports
|
| 370 |
+
- ✅ **Checklist Matching** - Due diligence checklist parsing
|
| 371 |
+
- ✅ **Q&A Functionality** - Document search and AI-powered answers
|
| 372 |
+
- ✅ **Export Functionality** - Report export capabilities
|
| 373 |
+
|
| 374 |
+
### Running Tests
|
| 375 |
+
```bash
|
| 376 |
+
# Install test dependencies
|
| 377 |
+
uv sync
|
| 378 |
+
|
| 379 |
+
# Run all tests
|
| 380 |
+
uv run pytest
|
| 381 |
+
|
| 382 |
+
# Run specific test categories
|
| 383 |
+
uv run pytest -m unit # Unit tests only
|
| 384 |
+
uv run pytest -m integration # Integration tests only
|
| 385 |
+
|
| 386 |
+
# Run tests with coverage
|
| 387 |
+
uv run pytest --cov=app --cov-report=html
|
| 388 |
+
|
| 389 |
+
# Run tests in parallel (faster)
|
| 390 |
+
uv run pytest -n auto
|
| 391 |
+
|
| 392 |
+
# Run specific test file
|
| 393 |
+
uv run pytest tests/unit/test_config.py
|
| 394 |
+
|
| 395 |
+
# Run tests with verbose output
|
| 396 |
+
uv run pytest -v
|
| 397 |
+
|
| 398 |
+
# Run tests and stop on first failure
|
| 399 |
+
uv run pytest -x
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
### Test Structure
|
| 403 |
+
```
|
| 404 |
+
tests/
|
| 405 |
+
├── __init__.py # Test package
|
| 406 |
+
├── conftest.py # Shared fixtures and configuration
|
| 407 |
+
├── unit/ # Unit tests
|
| 408 |
+
│ ├── __init__.py
|
| 409 |
+
│ ├── test_config.py # Configuration tests
|
| 410 |
+
│ ├── test_handlers.py # Handler tests
|
| 411 |
+
│ ├── test_parsers.py # Parser tests
|
| 412 |
+
│ ├── test_services.py # Service tests
|
| 413 |
+
│ └── test_session.py # Session management tests
|
| 414 |
+
└── integration/ # Integration tests
|
| 415 |
+
├── __init__.py
|
| 416 |
+
├── test_ai_workflows.py # AI workflow tests
|
| 417 |
+
├── test_core_services.py # Core service integration
|
| 418 |
+
├── test_critical_workflows.py # Critical workflow tests
|
| 419 |
+
├── test_export_and_ui.py # Export and UI integration
|
| 420 |
+
└── test_workflows.py # General workflow tests
|
| 421 |
+
```
|
| 422 |
+
|
| 423 |
+
### Writing Tests
|
| 424 |
+
```python
|
| 425 |
+
import pytest
|
| 426 |
+
from app.core.parsers import parse_checklist
|
| 427 |
+
|
| 428 |
+
@pytest.mark.unit
|
| 429 |
+
def test_checklist_parsing():
|
| 430 |
+
"""Test checklist parsing functionality"""
|
| 431 |
+
checklist_text = """
|
| 432 |
+
## A. Test Category
|
| 433 |
+
1. First item
|
| 434 |
+
2. Second item
|
| 435 |
+
"""
|
| 436 |
+
|
| 437 |
+
parsed = parse_checklist(checklist_text)
|
| 438 |
+
|
| 439 |
+
assert isinstance(parsed, dict)
|
| 440 |
+
assert "A. Test Category" in parsed
|
| 441 |
+
assert len(parsed["A. Test Category"]["items"]) == 2
|
| 442 |
+
```
|
| 443 |
+
|
| 444 |
+
### Test Configuration
|
| 445 |
+
- **Coverage**: Minimum 80% code coverage required
|
| 446 |
+
- **Markers**: `unit`, `integration`, `functional`, `slow`, `skip_ci`
|
| 447 |
+
- **Parallel**: Tests can run in parallel for faster execution
|
| 448 |
+
- **Auto-discovery**: Tests are automatically discovered from `test_*.py` files
|
| 449 |
+
|
| 450 |
+
### CI/CD Integration
|
| 451 |
+
Tests are configured to run automatically in CI/CD pipelines with:
|
| 452 |
+
- Coverage reporting
|
| 453 |
+
- Parallel test execution
|
| 454 |
+
- Test result artifacts
|
| 455 |
+
- Failure notifications
|
| 456 |
+
|
| 457 |
## 📱 User Interface
|
| 458 |
|
| 459 |
### Sidebar Layout
|
|
|
|
| 493 |
|
| 494 |
```
|
| 495 |
dd_poc/
|
| 496 |
+
├── app/ # 📦 Main application package
|
| 497 |
+
│ ├── main.py # 🎯 Main Streamlit application
|
| 498 |
+
│ ├── __init__.py
|
| 499 |
+
│ ├── ai/ # 🧠 AI Integration Module
|
| 500 |
+
│ │ ├── __init__.py
|
| 501 |
+
│ │ ├── agent_core.py # LangGraph agent setup & DDChecklistAgent
|
| 502 |
+
│ │ ├── agent_utils.py # AI utility functions
|
| 503 |
+
│ │ ├── document_classifier.py # Document classification
|
| 504 |
+
│ │ ├── processing_pipeline.py # AI processing workflows
|
| 505 |
+
│ │ └── prompts.py # AI prompt templates
|
| 506 |
+
│ ├── core/ # Core functionality
|
| 507 |
+
│ │ ├── __init__.py
|
| 508 |
+
│ │ ├── config.py # Configuration management
|
| 509 |
+
│ │ ├── constants.py # Application constants
|
| 510 |
+
│ │ ├── content_ingestion.py # Document ingestion
|
| 511 |
+
│ │ ├── document_processor.py # Document processing
|
| 512 |
+
│ │ ├── exceptions.py # Custom exceptions
|
| 513 |
+
│ │ ├── logging.py # Logging configuration
|
| 514 |
+
│ │ ├── model_cache.py # Model caching system
|
| 515 |
+
│ │ ├── parsers.py # Data parsers
|
| 516 |
+
│ │ ├── reports.py # Report generation
|
| 517 |
+
│ │ ├── search.py # Search functionality
|
| 518 |
+
│ │ └── utils.py # Utility functions
|
| 519 |
+
│ ├── handlers/ # Request handlers
|
| 520 |
+
│ │ ├── __init__.py
|
| 521 |
+
│ │ ├── ai_handler.py # AI request handling
|
| 522 |
+
│ │ ├── document_handler.py # Document operations
|
| 523 |
+
│ │ └── export_handler.py # Export functionality
|
| 524 |
+
│ ├── services/ # Business logic services
|
| 525 |
+
│ │ ├── ai_client.py # AI client service
|
| 526 |
+
│ │ ├── ai_config.py # AI configuration
|
| 527 |
+
│ │ ├── ai_service.py # AI service layer
|
| 528 |
+
│ │ └── response_parser.py # Response parsing
|
| 529 |
+
│ ├── ui/ # User interface components
|
| 530 |
+
│ │ ├── __init__.py
|
| 531 |
+
│ │ ├── components.py # UI components
|
| 532 |
+
│ │ ├── sidebar.py # Sidebar component
|
| 533 |
+
│ │ ├── tabs/ # Tab components
|
| 534 |
+
│ │ │ ├── __init__.py
|
| 535 |
+
│ │ │ ├── checklist_tab.py
|
| 536 |
+
│ │ │ ├── overview_tab.py
|
| 537 |
+
│ │ │ ├── qa_tab.py
|
| 538 |
+
│ │ │ ├── questions_tab.py
|
| 539 |
+
│ │ │ └── strategic_tab.py
|
| 540 |
+
│ │ └── ui_components/ # Additional UI components
|
| 541 |
+
│ ├── error_handler.py # Error handling
|
| 542 |
+
│ └── session_manager.py # Session management
|
| 543 |
├── data/ # 📊 Data directories
|
| 544 |
│ ├── checklist/ # Due diligence checklists (.md)
|
| 545 |
│ ├── questions/ # Question lists (.md)
|
| 546 |
│ ├── strategy/ # Strategic documents (.md)
|
| 547 |
+
│ ├── search_indexes/ # FAISS and BM25 indices with metadata
|
| 548 |
│ └── vdrs/ # Virtual Data Rooms (2 projects)
|
| 549 |
│ ├── automated-services-transformation/
|
| 550 |
│ └── industrial-security-leadership/
|
| 551 |
+
├── models/ # 🤖 Cached AI models
|
| 552 |
+
│ ├── sentence_transformers/
|
| 553 |
+
│ └── cross_encoder/
|
| 554 |
+
├── tests/ # 🧪 Test suite
|
| 555 |
+
│ ├── unit/ # Unit tests
|
| 556 |
+
│ ├── integration/ # Integration tests
|
| 557 |
+
│ └── conftest.py # Test configuration
|
| 558 |
+
├── pyproject.toml # Python dependencies and project configuration
|
| 559 |
+
├── scripts/start.py # 🚀 Launch script (Python)
|
| 560 |
+
├── uv.lock # uv dependency lock file
|
| 561 |
├── .env # API keys (create this)
|
| 562 |
+
└── README.md # This file
|
|
|
|
| 563 |
```
|
| 564 |
|
| 565 |
## 🎨 Key Features Explained
|
|
|
|
| 568 |
- **Supported Formats**: PDF, DOCX, DOC, TXT, MD
|
| 569 |
- **Parallel Processing**: Multi-threaded document extraction (4 workers default)
|
| 570 |
- **Smart Chunking**: 400-character chunks with 50-character overlap
|
| 571 |
+
- **Embeddings**: Sentence-transformers (all-mpnet-base-v2, 768 dimensions)
|
| 572 |
- **Vector Store**: FAISS IndexFlatIP for 10x faster similarity search
|
| 573 |
- **Caching**: Intelligent embedding cache with invalidation
|
| 574 |
|
|
|
|
| 616 |
4. Add ANTHROPIC_API_KEY in Streamlit secrets
|
| 617 |
5. Deploy (automatic)
|
| 618 |
|
| 619 |
+
## 🤖 Model Caching for Streamlit Cloud
|
| 620 |
+
|
| 621 |
+
To optimize performance and avoid download delays on Streamlit Cloud, models are cached locally in the repository:
|
| 622 |
+
|
| 623 |
+
### Download Models Locally
|
| 624 |
```bash
|
| 625 |
+
# Download and cache models for offline use
|
| 626 |
+
python download_models.py
|
| 627 |
+
```
|
| 628 |
|
| 629 |
+
### Cached Models
|
| 630 |
+
- **Sentence Transformer**: `sentence-transformers/all-mpnet-base-v2` (~418MB)
|
| 631 |
+
- **Cross-Encoder**: `cross-encoder/ms-marco-MiniLM-L-6-v2` (~88MB)
|
| 632 |
|
| 633 |
+
### Automatic Model Loading
|
| 634 |
+
The application automatically:
|
| 635 |
+
1. Checks for local models in `models/` directory first
|
| 636 |
+
2. Falls back to HuggingFace download if local models not found
|
| 637 |
+
3. Caches loaded models in memory for reuse
|
| 638 |
|
| 639 |
+
### Benefits
|
| 640 |
+
- ⚡ **Faster startup**: No download delays on Streamlit Cloud
|
| 641 |
+
- 💾 **Offline capable**: Works without internet for model loading
|
| 642 |
+
- 🔄 **Version control**: Models are versioned with your code
|
| 643 |
+
- 🚀 **Consistent performance**: Same model versions across deployments
|
| 644 |
|
| 645 |
### Option 3: Local Development
|
| 646 |
```bash
|
|
|
|
| 648 |
uv sync
|
| 649 |
|
| 650 |
# Run with hot reload for development
|
| 651 |
+
uv run streamlit run app/main.py --server.runOnSave true
|
| 652 |
|
| 653 |
# Add new dependencies
|
| 654 |
uv add <package-name>
|
|
|
|
| 657 |
uv lock --upgrade
|
| 658 |
```
|
| 659 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
## 💡 Usage Tips
|
| 662 |
|
|
|
|
| 742 |
uv run python -c "from app import DDChecklistApp; app = DDChecklistApp(); print('✅ App working')"
|
| 743 |
|
| 744 |
# Test AI module specifically
|
| 745 |
+
uv run python -c "from app.ai import agent_core; print('✅ AI module available')"
|
| 746 |
|
| 747 |
# Check project structure
|
| 748 |
+
ls -la app/ && ls -la app/ai/
|
| 749 |
|
| 750 |
# Clean Python cache files
|
| 751 |
find . -name "*.pyc" -delete && find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
|
|
|
|
| 792 |
|
| 793 |
This application uses a **modular architecture** with clear separation of concerns:
|
| 794 |
|
| 795 |
+
- **`app/main.py`**: Main Streamlit application orchestrator
|
| 796 |
+
- **`app/`**: All modules organized by responsibility
|
| 797 |
+
- **`core/`**: Core functionality
|
| 798 |
+
- **`config.py`**: Configuration management with dataclasses
|
| 799 |
+
- **`document_processor.py`**: File handling, text extraction, and FAISS integration
|
| 800 |
+
- **`parsers.py`**: Data parsing and processing
|
| 801 |
+
- **`search.py`**: Search functionality with FAISS integration
|
| 802 |
+
- **`utils.py`**: Error handling, logging, and utilities
|
| 803 |
+
- **`ai/`**: **AI Integration Module**
|
| 804 |
- **`agent_core.py`**: LangGraph agent setup & DDChecklistAgent class
|
| 805 |
+
- **`agent_utils.py`**: AI utility functions and helpers
|
| 806 |
+
- **`processing_pipeline.py`**: AI processing workflows and pipelines
|
| 807 |
- **`prompts.py`**: AI prompt templates
|
| 808 |
+
- **`handlers/`**: Request handlers
|
| 809 |
+
- **`ai_handler.py`**: AI request processing
|
| 810 |
+
- **`document_handler.py`**: Document operations
|
| 811 |
+
- **`export_handler.py`**: Export functionality
|
| 812 |
+
- **`services/`**: Business logic services
|
| 813 |
+
- **`ai_service.py`**: AI service layer
|
| 814 |
+
- **`ai_client.py`**: AI client interface
|
| 815 |
+
- **`response_parser.py`**: Response parsing and formatting
|
| 816 |
+
- **`ui/`**: User interface components
|
| 817 |
+
- **`components.py`**: Reusable Streamlit components
|
| 818 |
+
- **`tabs/`**: Tab-specific UI components
|
| 819 |
|
| 820 |
### Key Architectural Improvements (2025)
|
| 821 |
+
- ✅ **Modular Design**: Clean separation between core, AI, handlers, services, and UI
|
| 822 |
- ✅ **FAISS Integration**: 10x faster document similarity search
|
| 823 |
- ✅ **Parallel Processing**: Multi-threaded document extraction
|
| 824 |
- ✅ **Current Models**: Updated to 2025 Claude model names
|
|
|
|
| 828 |
## 🤝 Contributing
|
| 829 |
|
| 830 |
Contributions welcome! The modular architecture makes it easy to extend:
|
| 831 |
+
- Add new AI models in `app/ai/agent_core.py`
|
| 832 |
+
- Extend document processing in `app/core/document_processor.py`
|
| 833 |
+
- Add UI components in `app/ui/components.py`
|
| 834 |
+
- Create new services in `app/services/`
|
| 835 |
|
| 836 |
## 📧 Support
|
| 837 |
|
| 838 |
For questions or support:
|
| 839 |
1. Check the [troubleshooting section](#-troubleshooting)
|
| 840 |
+
2. Test your setup: `uv run python -c "from app import main; print('✅ App ready')"`
|
| 841 |
+
3. Verify AI models: `uv run python -c "from app.ai.agent_core import DDChecklistAgent; print('✅ AI available')"`
|
| 842 |
4. Open an issue on GitHub
|
| 843 |
|
| 844 |
---
|
app.py
DELETED
|
@@ -1,599 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
DD-Checklist Main Application - Refactored Version
|
| 4 |
-
|
| 5 |
-
This is the main Streamlit application that orchestrates all components
|
| 6 |
-
using the new modular architecture for better maintainability.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import os
|
| 10 |
-
import warnings
|
| 11 |
-
import logging
|
| 12 |
-
|
| 13 |
-
# Fix tokenizers parallelism warning early
|
| 14 |
-
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 15 |
-
|
| 16 |
-
# Only suppress specific known non-critical warnings
|
| 17 |
-
warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
|
| 18 |
-
warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
|
| 19 |
-
|
| 20 |
-
import streamlit as st
|
| 21 |
-
|
| 22 |
-
from pathlib import Path
|
| 23 |
-
from typing import Dict
|
| 24 |
-
|
| 25 |
-
# Import our refactored modules
|
| 26 |
-
from src import (
|
| 27 |
-
init_config, DocumentProcessor,
|
| 28 |
-
logger,
|
| 29 |
-
render_project_selector,
|
| 30 |
-
render_ai_settings, escape_markdown_math,
|
| 31 |
-
get_mime_type, format_document_title
|
| 32 |
-
)
|
| 33 |
-
from src.config import configure_langchain_logging
|
| 34 |
-
from src.document_processing import safe_execute
|
| 35 |
-
# Using Streamlit directly for simplicity
|
| 36 |
-
from src.ui_components import (
|
| 37 |
-
render_file_selector, render_checklist_results, render_question_results,
|
| 38 |
-
render_quick_questions, create_document_link
|
| 39 |
-
)
|
| 40 |
-
from src.services import (
|
| 41 |
-
search_documents
|
| 42 |
-
)
|
| 43 |
-
|
| 44 |
-
from src.config import show_success, show_error, show_info
|
| 45 |
-
|
| 46 |
-
# Import LangGraph + Anthropic configuration
|
| 47 |
-
from src.ai import (
|
| 48 |
-
DDChecklistAgent
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
class DDChecklistApp:
|
| 53 |
-
"""
|
| 54 |
-
Main application class that orchestrates all components
|
| 55 |
-
"""
|
| 56 |
-
|
| 57 |
-
def __init__(self):
|
| 58 |
-
"""Initialize the application"""
|
| 59 |
-
# Initialize configuration
|
| 60 |
-
self.config = init_config()
|
| 61 |
-
|
| 62 |
-
# Initialize session state
|
| 63 |
-
self._init_session_state()
|
| 64 |
-
|
| 65 |
-
# Configure Streamlit page
|
| 66 |
-
st.set_page_config(
|
| 67 |
-
page_title=self.config.ui.page_title,
|
| 68 |
-
page_icon=self.config.ui.page_icon,
|
| 69 |
-
layout=self.config.ui.layout
|
| 70 |
-
)
|
| 71 |
-
|
| 72 |
-
# Initialize services (will be loaded when needed)
|
| 73 |
-
self.model_name = self.config.model.sentence_transformer_model
|
| 74 |
-
self.document_processor = None
|
| 75 |
-
self.agent = None
|
| 76 |
-
|
| 77 |
-
def _init_session_state(self):
|
| 78 |
-
"""Initialize essential session state variables only"""
|
| 79 |
-
essential_defaults = {
|
| 80 |
-
'documents': {},
|
| 81 |
-
'chunks': [],
|
| 82 |
-
'embeddings': None,
|
| 83 |
-
'checklist_results': {},
|
| 84 |
-
'question_answers': {},
|
| 85 |
-
'company_summary': "",
|
| 86 |
-
'strategy_analysis': "",
|
| 87 |
-
'agent': None,
|
| 88 |
-
# Sidebar file selections
|
| 89 |
-
'selected_strategy_path': None,
|
| 90 |
-
'selected_strategy_text': "",
|
| 91 |
-
'selected_checklist_path': None,
|
| 92 |
-
'selected_checklist_text': "",
|
| 93 |
-
'selected_questions_path': None,
|
| 94 |
-
'selected_questions_text': ""
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
for key, default_value in essential_defaults.items():
|
| 98 |
-
if key not in st.session_state:
|
| 99 |
-
st.session_state[key] = default_value
|
| 100 |
-
|
| 101 |
-
def initialize_services(self):
|
| 102 |
-
"""Initialize core services"""
|
| 103 |
-
if self.document_processor is None:
|
| 104 |
-
self.document_processor = DocumentProcessor(self.model_name)
|
| 105 |
-
|
| 106 |
-
# Restore document processor state from session state if available
|
| 107 |
-
if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
|
| 108 |
-
hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
|
| 109 |
-
|
| 110 |
-
self.document_processor.chunks = st.session_state.chunks
|
| 111 |
-
self.document_processor.embeddings = st.session_state.embeddings
|
| 112 |
-
# Note: Don't restore documents here - they'll be recreated from chunks if needed
|
| 113 |
-
|
| 114 |
-
def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
|
| 115 |
-
"""
|
| 116 |
-
Setup AI agent
|
| 117 |
-
|
| 118 |
-
Args:
|
| 119 |
-
api_key: Anthropic API key
|
| 120 |
-
model_choice: Claude model to use
|
| 121 |
-
|
| 122 |
-
Returns:
|
| 123 |
-
True if agent was successfully initialized
|
| 124 |
-
"""
|
| 125 |
-
try:
|
| 126 |
-
with st.spinner("Initializing AI agent..."):
|
| 127 |
-
agent = DDChecklistAgent(api_key, model_choice)
|
| 128 |
-
|
| 129 |
-
if agent.is_available():
|
| 130 |
-
st.session_state.agent = agent
|
| 131 |
-
self.agent = agent
|
| 132 |
-
show_success("✅ AI Agent ready")
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
return True
|
| 136 |
-
else:
|
| 137 |
-
show_error("❌ Failed to initialize agent")
|
| 138 |
-
return False
|
| 139 |
-
except Exception as e:
|
| 140 |
-
show_error(f"Agent initialization failed: {str(e)}")
|
| 141 |
-
return False
|
| 142 |
-
|
| 143 |
-
def render_sidebar(self) -> tuple:
|
| 144 |
-
"""
|
| 145 |
-
Render sidebar with project selection, file selectors, and AI settings
|
| 146 |
-
|
| 147 |
-
Returns:
|
| 148 |
-
Tuple of (selected_data_room_path, use_ai_features, process_button)
|
| 149 |
-
"""
|
| 150 |
-
with st.sidebar:
|
| 151 |
-
# Project and data room selection
|
| 152 |
-
selected_project_path, selected_data_room_path = render_project_selector()
|
| 153 |
-
|
| 154 |
-
# Process button
|
| 155 |
-
process_button = st.button(
|
| 156 |
-
"🚀 Process Data Room",
|
| 157 |
-
type="primary",
|
| 158 |
-
use_container_width=True
|
| 159 |
-
)
|
| 160 |
-
|
| 161 |
-
if process_button:
|
| 162 |
-
show_success("Processing... Check main area for progress")
|
| 163 |
-
|
| 164 |
-
st.divider()
|
| 165 |
-
|
| 166 |
-
# Strategy, Checklist, and Questions selectors
|
| 167 |
-
st.subheader("📋 Analysis Configuration")
|
| 168 |
-
|
| 169 |
-
# Strategy selector
|
| 170 |
-
strategy_path, strategy_text = render_file_selector(
|
| 171 |
-
self.config.paths.strategy_dir, "Strategy", "sidebar", "🎯"
|
| 172 |
-
)
|
| 173 |
-
# Store in session state
|
| 174 |
-
st.session_state.selected_strategy_path = strategy_path
|
| 175 |
-
st.session_state.selected_strategy_text = strategy_text
|
| 176 |
-
|
| 177 |
-
# Checklist selector
|
| 178 |
-
checklist_path, checklist_text = render_file_selector(
|
| 179 |
-
self.config.paths.checklist_dir, "Checklist", "sidebar", "📊"
|
| 180 |
-
)
|
| 181 |
-
# Store in session state
|
| 182 |
-
st.session_state.selected_checklist_path = checklist_path
|
| 183 |
-
st.session_state.selected_checklist_text = checklist_text
|
| 184 |
-
|
| 185 |
-
# Questions selector
|
| 186 |
-
questions_path, questions_text = render_file_selector(
|
| 187 |
-
self.config.paths.questions_dir, "Questions", "sidebar", "❓"
|
| 188 |
-
)
|
| 189 |
-
# Store in session state
|
| 190 |
-
st.session_state.selected_questions_path = questions_path
|
| 191 |
-
st.session_state.selected_questions_text = questions_text
|
| 192 |
-
|
| 193 |
-
st.divider()
|
| 194 |
-
|
| 195 |
-
# AI settings
|
| 196 |
-
use_ai_features, api_key, model_choice = render_ai_settings()
|
| 197 |
-
|
| 198 |
-
# Initialize AI agent if enabled
|
| 199 |
-
if use_ai_features and api_key:
|
| 200 |
-
if not hasattr(st.session_state, 'agent') or st.session_state.agent is None:
|
| 201 |
-
self.setup_ai_agent(api_key, model_choice)
|
| 202 |
-
elif hasattr(st.session_state, 'agent') and st.session_state.agent:
|
| 203 |
-
self.agent = st.session_state.agent
|
| 204 |
-
else:
|
| 205 |
-
st.session_state.agent = None
|
| 206 |
-
self.agent = None
|
| 207 |
-
|
| 208 |
-
return selected_data_room_path, use_ai_features, process_button
|
| 209 |
-
|
| 210 |
-
def render_company_overview_tab(self):
|
| 211 |
-
"""Render company overview tab"""
|
| 212 |
-
# Use strategy from sidebar
|
| 213 |
-
strategy_text = st.session_state.get('selected_strategy_text', "")
|
| 214 |
-
|
| 215 |
-
# Check if we have documents to display summaries
|
| 216 |
-
if st.session_state.documents:
|
| 217 |
-
self._render_report_section("overview", strategy_text=strategy_text)
|
| 218 |
-
else:
|
| 219 |
-
show_info("👈 Configure and process data room to see analysis")
|
| 220 |
-
|
| 221 |
-
def render_strategic_analysis_tab(self):
|
| 222 |
-
"""Render strategic analysis tab"""
|
| 223 |
-
# Use strategy from sidebar
|
| 224 |
-
strategy_text = st.session_state.get('selected_strategy_text', "")
|
| 225 |
-
|
| 226 |
-
# Check if we have documents to display summaries
|
| 227 |
-
if st.session_state.documents:
|
| 228 |
-
self._render_report_section("strategic", strategy_text=strategy_text)
|
| 229 |
-
else:
|
| 230 |
-
show_info("👈 Configure and process data room to see analysis")
|
| 231 |
-
|
| 232 |
-
def _render_report_section(self, report_type: str, strategy_text: str = ""):
|
| 233 |
-
"""Unified report rendering for both overview and strategic analysis"""
|
| 234 |
-
from src.services import generate_reports
|
| 235 |
-
|
| 236 |
-
summary_key = f"{report_type}_summary"
|
| 237 |
-
|
| 238 |
-
# Check prerequisites for strategic analysis
|
| 239 |
-
if report_type == "strategic" and not st.session_state.checklist_results:
|
| 240 |
-
st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
|
| 241 |
-
return
|
| 242 |
-
|
| 243 |
-
# Auto-generate report if not already present and AI is available
|
| 244 |
-
if (not st.session_state.get(summary_key, "") and st.session_state.agent):
|
| 245 |
-
with st.spinner(f"🤖 Generating {report_type} analysis..."):
|
| 246 |
-
data_room_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 247 |
-
if st.session_state.documents else "Unknown")
|
| 248 |
-
|
| 249 |
-
st.session_state[summary_key] = generate_reports(
|
| 250 |
-
st.session_state.documents,
|
| 251 |
-
data_room_name,
|
| 252 |
-
strategy_text,
|
| 253 |
-
st.session_state.checklist_results,
|
| 254 |
-
report_type,
|
| 255 |
-
st.session_state.agent.llm if st.session_state.agent else None
|
| 256 |
-
)
|
| 257 |
-
|
| 258 |
-
# Display the report if available
|
| 259 |
-
if st.session_state.get(summary_key, ""):
|
| 260 |
-
st.markdown(st.session_state[summary_key])
|
| 261 |
-
|
| 262 |
-
# Add export and regenerate buttons
|
| 263 |
-
self._render_report_actions(report_type, summary_key)
|
| 264 |
-
|
| 265 |
-
def _render_report_actions(self, report_type: str, summary_key: str):
|
| 266 |
-
"""Render export and regenerate actions for reports"""
|
| 267 |
-
if report_type == "overview":
|
| 268 |
-
col1, col2 = st.columns([1, 5])
|
| 269 |
-
with col1:
|
| 270 |
-
company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 271 |
-
if st.session_state.documents else 'export')
|
| 272 |
-
file_name = f"company_overview_{company_name}.md"
|
| 273 |
-
st.download_button(
|
| 274 |
-
"📥 Export Summary",
|
| 275 |
-
data=f"# Company Overview\n\n{st.session_state[summary_key]}",
|
| 276 |
-
file_name=file_name,
|
| 277 |
-
mime="text/markdown",
|
| 278 |
-
key=f"export_{summary_key}"
|
| 279 |
-
)
|
| 280 |
-
with col2:
|
| 281 |
-
if st.button(f"🔄 Regenerate {report_type.title()}"):
|
| 282 |
-
st.session_state[summary_key] = ""
|
| 283 |
-
st.rerun()
|
| 284 |
-
else:
|
| 285 |
-
col1, col2 = st.columns([1, 5])
|
| 286 |
-
with col1:
|
| 287 |
-
# Combined report export for strategic analysis
|
| 288 |
-
combined_report = f"# Due Diligence Report\n\n"
|
| 289 |
-
combined_report += f"## Company Overview\n\n{st.session_state.get('overview_summary', '')}\n\n"
|
| 290 |
-
combined_report += f"## Strategic Analysis\n\n{st.session_state[summary_key]}"
|
| 291 |
-
|
| 292 |
-
company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 293 |
-
if st.session_state.documents else 'export')
|
| 294 |
-
file_name = f"dd_report_{company_name}.md"
|
| 295 |
-
st.download_button(
|
| 296 |
-
"📥 Export Report",
|
| 297 |
-
data=combined_report,
|
| 298 |
-
file_name=file_name,
|
| 299 |
-
mime="text/markdown",
|
| 300 |
-
key=f"export_combined_{summary_key}"
|
| 301 |
-
)
|
| 302 |
-
with col2:
|
| 303 |
-
if st.button(f"🔄 Regenerate {report_type.title()}"):
|
| 304 |
-
st.session_state[summary_key] = ""
|
| 305 |
-
st.rerun()
|
| 306 |
-
|
| 307 |
-
def render_analysis_tab(self, tab_type: str):
|
| 308 |
-
"""Unified rendering for checklist and questions tabs"""
|
| 309 |
-
if tab_type == "checklist":
|
| 310 |
-
# Use checklist from sidebar
|
| 311 |
-
file_text = st.session_state.get('selected_checklist_text', "")
|
| 312 |
-
|
| 313 |
-
if not file_text:
|
| 314 |
-
show_info("👈 Select a checklist in the sidebar to see analysis results")
|
| 315 |
-
return
|
| 316 |
-
|
| 317 |
-
# Render results if available
|
| 318 |
-
render_checklist_results(st.session_state.checklist_results)
|
| 319 |
-
|
| 320 |
-
elif tab_type == "questions":
|
| 321 |
-
# Use questions from sidebar
|
| 322 |
-
file_text = st.session_state.get('selected_questions_text', "")
|
| 323 |
-
|
| 324 |
-
if not file_text:
|
| 325 |
-
show_info("👈 Select a questions list in the sidebar to see analysis results")
|
| 326 |
-
return
|
| 327 |
-
|
| 328 |
-
# Render results if available
|
| 329 |
-
render_question_results(st.session_state.question_answers)
|
| 330 |
-
|
| 331 |
-
def render_qa_tab(self):
|
| 332 |
-
"""Render the Q&A with citations tab"""
|
| 333 |
-
if not st.session_state.chunks:
|
| 334 |
-
show_info("👈 Process data room first to enable Q&A")
|
| 335 |
-
return
|
| 336 |
-
|
| 337 |
-
# Question input
|
| 338 |
-
question = st.text_input(
|
| 339 |
-
"Ask a question about your documents:",
|
| 340 |
-
placeholder="e.g., What are the main risks? What is the revenue model? Who are the key customers?"
|
| 341 |
-
)
|
| 342 |
-
|
| 343 |
-
# Quick question buttons
|
| 344 |
-
quick_question = render_quick_questions()
|
| 345 |
-
if quick_question:
|
| 346 |
-
question = quick_question
|
| 347 |
-
|
| 348 |
-
st.divider()
|
| 349 |
-
|
| 350 |
-
if question:
|
| 351 |
-
self._handle_qa_query(question)
|
| 352 |
-
|
| 353 |
-
def _handle_qa_query(self, question: str):
|
| 354 |
-
"""Handle Q&A query and display results"""
|
| 355 |
-
if not self.document_processor:
|
| 356 |
-
self.initialize_services()
|
| 357 |
-
|
| 358 |
-
# Use lower threshold for Q&A to get more relevant results
|
| 359 |
-
qa_threshold = 0.25
|
| 360 |
-
|
| 361 |
-
with st.spinner("🔍 Searching documents..."):
|
| 362 |
-
results = search_documents(
|
| 363 |
-
self.document_processor,
|
| 364 |
-
question,
|
| 365 |
-
top_k=self.config.ui.top_k_search_results,
|
| 366 |
-
threshold=qa_threshold
|
| 367 |
-
)
|
| 368 |
-
|
| 369 |
-
if results:
|
| 370 |
-
# Use agent to synthesize answer if available
|
| 371 |
-
if (hasattr(st.session_state, 'agent') and st.session_state.agent and
|
| 372 |
-
hasattr(st.session_state.agent, 'llm')):
|
| 373 |
-
|
| 374 |
-
st.markdown("### 🤖 AI Agent's Answer")
|
| 375 |
-
with st.spinner("Agent analyzing documents..."):
|
| 376 |
-
# Convert results to document format for context
|
| 377 |
-
context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
|
| 378 |
-
# Use LLM directly for more reliable answers
|
| 379 |
-
from langchain_core.messages import HumanMessage
|
| 380 |
-
prompt = (f"Question: {question}\n\n"
|
| 381 |
-
f"Relevant document excerpts:\n{context}\n\n"
|
| 382 |
-
f"Provide a comprehensive answer with citations to the sources.")
|
| 383 |
-
response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
|
| 384 |
-
# Clean up any leading whitespace and escape math characters
|
| 385 |
-
answer_text = escape_markdown_math(response.content.strip())
|
| 386 |
-
st.markdown(answer_text)
|
| 387 |
-
st.divider()
|
| 388 |
-
|
| 389 |
-
st.markdown("### 📚 Source Documents")
|
| 390 |
-
|
| 391 |
-
# Display source documents with download buttons
|
| 392 |
-
for i, result in enumerate(results[:3], 1):
|
| 393 |
-
with st.container():
|
| 394 |
-
col1, col2 = st.columns([5, 1])
|
| 395 |
-
with col1:
|
| 396 |
-
excerpt = result['text'][:200] + "..." if len(result['text']) > 200 else result['text']
|
| 397 |
-
st.markdown(f"{i}. \"{excerpt}\"")
|
| 398 |
-
|
| 399 |
-
# Create clickable link for the document
|
| 400 |
-
doc_path = result.get('path', result.get('full_path', ''))
|
| 401 |
-
doc_name = result['source']
|
| 402 |
-
doc_title = format_document_title(doc_name)
|
| 403 |
-
|
| 404 |
-
if doc_path:
|
| 405 |
-
# Create unique key for this result
|
| 406 |
-
unique_key = f"result_{i}_{hash(doc_path) % 10000}"
|
| 407 |
-
col_a, col_b = st.columns([3, 1])
|
| 408 |
-
with col_a:
|
| 409 |
-
create_document_link(doc_path, doc_name, doc_title, unique_key)
|
| 410 |
-
with col_b:
|
| 411 |
-
st.caption(f"({result['citation']})")
|
| 412 |
-
else:
|
| 413 |
-
st.caption(f" 📄 {result['source']} ({result['citation']})")
|
| 414 |
-
|
| 415 |
-
with col2:
|
| 416 |
-
self._render_qa_download_button(result, i, question)
|
| 417 |
-
else:
|
| 418 |
-
st.warning("No relevant information found for your question.")
|
| 419 |
-
|
| 420 |
-
def _render_qa_download_button(self, result: Dict, idx: int, question: str):
|
| 421 |
-
"""Render download button for Q&A results"""
|
| 422 |
-
doc_path = result.get('path', '')
|
| 423 |
-
if doc_path:
|
| 424 |
-
try:
|
| 425 |
-
file_path = Path(doc_path)
|
| 426 |
-
if not file_path.is_absolute():
|
| 427 |
-
file_path = Path("data") / file_path
|
| 428 |
-
|
| 429 |
-
if file_path.exists():
|
| 430 |
-
with open(file_path, 'rb') as f:
|
| 431 |
-
file_bytes = f.read()
|
| 432 |
-
|
| 433 |
-
# Determine MIME type based on file extension
|
| 434 |
-
mime_type = get_mime_type(file_path)
|
| 435 |
-
|
| 436 |
-
button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
|
| 437 |
-
|
| 438 |
-
st.download_button(
|
| 439 |
-
label="📥 Download",
|
| 440 |
-
data=file_bytes,
|
| 441 |
-
file_name=result['source'],
|
| 442 |
-
mime=mime_type,
|
| 443 |
-
key=button_key,
|
| 444 |
-
help=f"Download {result['source']}"
|
| 445 |
-
)
|
| 446 |
-
except Exception as e:
|
| 447 |
-
st.error(f"Download failed: {str(e)}")
|
| 448 |
-
|
| 449 |
-
def process_data_room(self, data_room_path: str):
|
| 450 |
-
"""Simplified data room processing"""
|
| 451 |
-
if not Path(data_room_path).exists():
|
| 452 |
-
show_error(f"Data room path not found: {data_room_path}")
|
| 453 |
-
return
|
| 454 |
-
|
| 455 |
-
# Use safe_execute for the entire processing operation
|
| 456 |
-
def process_operation():
|
| 457 |
-
self.initialize_services()
|
| 458 |
-
# Simple processing - load documents
|
| 459 |
-
self.document_processor.load_data_room(data_room_path)
|
| 460 |
-
|
| 461 |
-
# Store results in session state with simplified structure
|
| 462 |
-
# Convert list of LangChain documents to dictionary format expected by UI
|
| 463 |
-
documents_dict = {}
|
| 464 |
-
for doc in self.document_processor.documents:
|
| 465 |
-
file_path = doc.metadata.get('source', doc.metadata.get('path', 'unknown'))
|
| 466 |
-
documents_dict[file_path] = {
|
| 467 |
-
'name': doc.metadata.get('name', Path(file_path).name if file_path != 'unknown' else 'unknown'),
|
| 468 |
-
'path': doc.metadata.get('path', ''),
|
| 469 |
-
'content': doc.page_content,
|
| 470 |
-
'metadata': doc.metadata
|
| 471 |
-
}
|
| 472 |
-
|
| 473 |
-
st.session_state.documents = documents_dict
|
| 474 |
-
st.session_state.chunks = self.document_processor.chunks
|
| 475 |
-
st.session_state.embeddings = self.document_processor.embeddings
|
| 476 |
-
|
| 477 |
-
# Process checklist and questions if available
|
| 478 |
-
self._process_checklist_and_questions()
|
| 479 |
-
|
| 480 |
-
# Clear any existing analysis to trigger regeneration
|
| 481 |
-
st.session_state.company_summary = ""
|
| 482 |
-
st.session_state.strategy_analysis = ""
|
| 483 |
-
st.session_state.overview_summary = ""
|
| 484 |
-
st.session_state.strategic_summary = ""
|
| 485 |
-
|
| 486 |
-
show_success("✅ Data room processing complete! View results in the tabs above.")
|
| 487 |
-
st.rerun()
|
| 488 |
-
|
| 489 |
-
safe_execute(
|
| 490 |
-
process_operation,
|
| 491 |
-
None,
|
| 492 |
-
"Data room processing"
|
| 493 |
-
)
|
| 494 |
-
|
| 495 |
-
def _process_checklist_and_questions(self):
|
| 496 |
-
"""Process checklist and questions after documents are loaded"""
|
| 497 |
-
from src.services import parse_checklist, parse_questions, create_vector_store, search_and_analyze
|
| 498 |
-
|
| 499 |
-
# Use checklist from sidebar selection
|
| 500 |
-
checklist_text = st.session_state.get('selected_checklist_text', "")
|
| 501 |
-
if checklist_text and self.document_processor.chunks:
|
| 502 |
-
try:
|
| 503 |
-
# Parse checklist
|
| 504 |
-
checklist = parse_checklist(checklist_text)
|
| 505 |
-
st.session_state.checklist = checklist
|
| 506 |
-
|
| 507 |
-
# Create vector store from chunks for processing
|
| 508 |
-
vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
|
| 509 |
-
|
| 510 |
-
# Process checklist items
|
| 511 |
-
checklist_results = search_and_analyze(
|
| 512 |
-
checklist,
|
| 513 |
-
vector_store,
|
| 514 |
-
self.agent.llm if self.agent else None,
|
| 515 |
-
self.config.processing.similarity_threshold,
|
| 516 |
-
'items'
|
| 517 |
-
)
|
| 518 |
-
st.session_state.checklist_results = checklist_results
|
| 519 |
-
logger.info("✅ Checklist processing completed")
|
| 520 |
-
except Exception as e:
|
| 521 |
-
logger.error(f"Checklist processing failed: {e}")
|
| 522 |
-
|
| 523 |
-
# Use questions from sidebar selection
|
| 524 |
-
questions_text = st.session_state.get('selected_questions_text', "")
|
| 525 |
-
if questions_text and self.document_processor.chunks:
|
| 526 |
-
try:
|
| 527 |
-
# Parse questions
|
| 528 |
-
questions = parse_questions(questions_text)
|
| 529 |
-
st.session_state.questions = questions
|
| 530 |
-
|
| 531 |
-
# Create vector store from chunks for processing (reuse if already created)
|
| 532 |
-
if 'vector_store' not in locals():
|
| 533 |
-
vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
|
| 534 |
-
|
| 535 |
-
# Process questions
|
| 536 |
-
question_answers = search_and_analyze(
|
| 537 |
-
questions,
|
| 538 |
-
vector_store,
|
| 539 |
-
self.agent.llm if self.agent else None,
|
| 540 |
-
self.config.processing.relevancy_threshold,
|
| 541 |
-
'questions'
|
| 542 |
-
)
|
| 543 |
-
st.session_state.question_answers = question_answers
|
| 544 |
-
logger.info("✅ Questions processing completed")
|
| 545 |
-
except Exception as e:
|
| 546 |
-
logger.error(f"Questions processing failed: {e}")
|
| 547 |
-
|
| 548 |
-
def run(self):
|
| 549 |
-
"""Run the main application"""
|
| 550 |
-
# Render header
|
| 551 |
-
st.title("🤖 AI Due Diligence")
|
| 552 |
-
st.markdown("**Intelligent M&A Analysis:** Strategic assessment, automated document review, and AI-powered insights")
|
| 553 |
-
|
| 554 |
-
# Render sidebar and get selections
|
| 555 |
-
selected_data_room_path, use_ai_features, process_button = self.render_sidebar()
|
| 556 |
-
|
| 557 |
-
# Main tabs - Company Overview and Strategic Analysis moved to top level
|
| 558 |
-
tab1, tab2, tab3, tab4, tab5 = st.tabs([
|
| 559 |
-
"🏢 Company Overview",
|
| 560 |
-
"🎯 Strategic Analysis",
|
| 561 |
-
"📊 Checklist Matching",
|
| 562 |
-
"❓ Due Diligence Questions",
|
| 563 |
-
"💬 Q&A with Citations"
|
| 564 |
-
])
|
| 565 |
-
|
| 566 |
-
with tab1:
|
| 567 |
-
self.render_company_overview_tab()
|
| 568 |
-
|
| 569 |
-
with tab2:
|
| 570 |
-
self.render_strategic_analysis_tab()
|
| 571 |
-
|
| 572 |
-
with tab3:
|
| 573 |
-
self.render_analysis_tab("checklist")
|
| 574 |
-
|
| 575 |
-
with tab4:
|
| 576 |
-
self.render_analysis_tab("questions")
|
| 577 |
-
|
| 578 |
-
with tab5:
|
| 579 |
-
self.render_qa_tab()
|
| 580 |
-
|
| 581 |
-
# Processing complete message is handled in process_data_room function
|
| 582 |
-
|
| 583 |
-
# Simplified processing trigger
|
| 584 |
-
if process_button and selected_data_room_path:
|
| 585 |
-
with st.spinner("🚀 Processing data room..."):
|
| 586 |
-
self.process_data_room(selected_data_room_path)
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
def main():
|
| 590 |
-
"""Main application entry point"""
|
| 591 |
-
# Configure LangChain logging to reduce verbosity
|
| 592 |
-
configure_langchain_logging(log_level="WARNING")
|
| 593 |
-
|
| 594 |
-
app = DDChecklistApp()
|
| 595 |
-
app.run()
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
if __name__ == "__main__":
|
| 599 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Refactored DD Checklist Application
|
| 3 |
+
|
| 4 |
+
A modular Streamlit application for AI-powered due diligence analysis.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__version__ = "2.0.0"
|
{src → app}/ai/__init__.py
RENAMED
|
@@ -18,27 +18,27 @@ from .prompts import (
|
|
| 18 |
|
| 19 |
# Direct imports for AI functionality - assuming dependencies are present
|
| 20 |
from .agent_core import (
|
| 21 |
-
|
| 22 |
-
get_langgraph_agent
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
TaskType
|
| 25 |
)
|
| 26 |
|
| 27 |
# Export main public API
|
| 28 |
__all__ = [
|
| 29 |
# Core agent functionality
|
| 30 |
-
'
|
| 31 |
'get_langgraph_agent',
|
| 32 |
-
|
| 33 |
|
| 34 |
-
|
| 35 |
-
# Agent types and state (now in agent_core)
|
| 36 |
'AgentState',
|
| 37 |
'TaskType',
|
| 38 |
-
|
| 39 |
# Prompt functions
|
| 40 |
'get_checklist_parsing_prompt',
|
| 41 |
-
'get_document_relevance_prompt',
|
| 42 |
'get_question_answering_prompt',
|
| 43 |
'get_findings_summary_prompt',
|
| 44 |
'get_description_generation_prompt',
|
|
|
|
| 18 |
|
| 19 |
# Direct imports for AI functionality - assuming dependencies are present
|
| 20 |
from .agent_core import (
|
| 21 |
+
Agent,
|
| 22 |
+
get_langgraph_agent
|
| 23 |
+
)
|
| 24 |
+
from .agent_utils import (
|
| 25 |
+
AgentState,
|
| 26 |
TaskType
|
| 27 |
)
|
| 28 |
|
| 29 |
# Export main public API
|
| 30 |
__all__ = [
|
| 31 |
# Core agent functionality
|
| 32 |
+
'Agent',
|
| 33 |
'get_langgraph_agent',
|
|
|
|
| 34 |
|
| 35 |
+
# Agent types and state
|
|
|
|
| 36 |
'AgentState',
|
| 37 |
'TaskType',
|
| 38 |
+
|
| 39 |
# Prompt functions
|
| 40 |
'get_checklist_parsing_prompt',
|
| 41 |
+
'get_document_relevance_prompt',
|
| 42 |
'get_question_answering_prompt',
|
| 43 |
'get_findings_summary_prompt',
|
| 44 |
'get_description_generation_prompt',
|
app/ai/agent_core.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
LangGraph Agent Core Module
|
| 4 |
+
|
| 5 |
+
This module contains the main LangGraph agent setup and the high-level
|
| 6 |
+
Agent class for interacting with the agent system.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
# Standard library imports
|
| 10 |
+
import logging
|
| 11 |
+
from typing import Optional, Dict, List, Any, Tuple
|
| 12 |
+
|
| 13 |
+
# Third-party imports
|
| 14 |
+
import streamlit as st
|
| 15 |
+
from langchain_anthropic import ChatAnthropic
|
| 16 |
+
from langchain_core.messages import HumanMessage, AIMessage
|
| 17 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 18 |
+
from langgraph.graph import StateGraph, END
|
| 19 |
+
|
| 20 |
+
# Local imports
|
| 21 |
+
from app.ai.agent_utils import AgentState
|
| 22 |
+
from app.ai.processing_pipeline import route_task, route_condition
|
| 23 |
+
from app.ai.processing_pipeline import (
|
| 24 |
+
parse_checklist_node,
|
| 25 |
+
match_checklist_node,
|
| 26 |
+
answer_question_node,
|
| 27 |
+
summarize_node
|
| 28 |
+
)
|
| 29 |
+
from app.core.config import get_config
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Agent Functions
|
| 36 |
+
|
| 37 |
+
def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, "ChatAnthropic"]]:
|
| 38 |
+
"""
|
| 39 |
+
Create a LangGraph agent with Anthropic
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
api_key: Anthropic API key (optional, will be sourced from environment/config)
|
| 43 |
+
model: Model name to use (optional, will use config default)
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
Tuple of (compiled_app, llm) or None if not available
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
# Get configuration
|
| 50 |
+
config = get_config()
|
| 51 |
+
|
| 52 |
+
# Get API key from various sources
|
| 53 |
+
if not api_key:
|
| 54 |
+
api_key = config.api.anthropic_api_key
|
| 55 |
+
if not api_key and st and hasattr(st, 'secrets') and 'ANTHROPIC_API_KEY' in st.secrets:
|
| 56 |
+
api_key = st.secrets['ANTHROPIC_API_KEY']
|
| 57 |
+
|
| 58 |
+
if not api_key:
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
# Use model from config if not specified
|
| 62 |
+
if not model:
|
| 63 |
+
model = config.model.claude_model
|
| 64 |
+
|
| 65 |
+
# Initialize Claude with config values
|
| 66 |
+
llm = ChatAnthropic(
|
| 67 |
+
model=model,
|
| 68 |
+
anthropic_api_key=api_key,
|
| 69 |
+
temperature=config.model.temperature,
|
| 70 |
+
max_tokens=config.model.max_tokens
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# No custom tools needed - using built-in LangGraph functionality
|
| 74 |
+
|
| 75 |
+
# Create the graph
|
| 76 |
+
workflow = StateGraph(AgentState)
|
| 77 |
+
|
| 78 |
+
# Create node functions that have access to the llm
|
| 79 |
+
def _route_task(state: AgentState) -> AgentState:
|
| 80 |
+
return route_task(state)
|
| 81 |
+
|
| 82 |
+
def _parse_checklist_node(state: AgentState) -> AgentState:
|
| 83 |
+
return parse_checklist_node(state, llm)
|
| 84 |
+
|
| 85 |
+
def _match_checklist_node(state: AgentState) -> AgentState:
|
| 86 |
+
return match_checklist_node(state, llm)
|
| 87 |
+
|
| 88 |
+
def _answer_question_node(state: AgentState) -> AgentState:
|
| 89 |
+
return answer_question_node(state, llm)
|
| 90 |
+
|
| 91 |
+
def _summarize_node(state: AgentState) -> AgentState:
|
| 92 |
+
return summarize_node(state, llm)
|
| 93 |
+
|
| 94 |
+
# Add nodes to workflow
|
| 95 |
+
workflow.add_node("route", _route_task)
|
| 96 |
+
workflow.add_node("parse_checklist", _parse_checklist_node)
|
| 97 |
+
workflow.add_node("match_checklist", _match_checklist_node)
|
| 98 |
+
workflow.add_node("answer_question", _answer_question_node)
|
| 99 |
+
workflow.add_node("summarize", _summarize_node)
|
| 100 |
+
|
| 101 |
+
# Define edges
|
| 102 |
+
workflow.set_entry_point("route")
|
| 103 |
+
|
| 104 |
+
# Conditional routing based on next_action
|
| 105 |
+
workflow.add_conditional_edges(
|
| 106 |
+
"route",
|
| 107 |
+
route_condition,
|
| 108 |
+
{
|
| 109 |
+
"parse_checklist": "parse_checklist",
|
| 110 |
+
"match_checklist": "match_checklist",
|
| 111 |
+
"answer_question": "answer_question",
|
| 112 |
+
"summarize": "summarize"
|
| 113 |
+
}
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
# All task nodes go to END
|
| 117 |
+
workflow.add_edge("parse_checklist", END)
|
| 118 |
+
workflow.add_edge("match_checklist", END)
|
| 119 |
+
workflow.add_edge("answer_question", END)
|
| 120 |
+
workflow.add_edge("summarize", END)
|
| 121 |
+
|
| 122 |
+
# Compile with memory
|
| 123 |
+
memory = MemorySaver()
|
| 124 |
+
app = workflow.compile(checkpointer=memory)
|
| 125 |
+
|
| 126 |
+
return app, llm
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class Agent:
|
| 130 |
+
"""High-level interface for the LangGraph agent"""
|
| 131 |
+
|
| 132 |
+
def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
|
| 133 |
+
"""
|
| 134 |
+
Initialize the Agent
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
api_key: Anthropic API key (optional)
|
| 138 |
+
model: Model name to use
|
| 139 |
+
"""
|
| 140 |
+
result = get_langgraph_agent(api_key, model)
|
| 141 |
+
if result:
|
| 142 |
+
self.app, self.llm = result
|
| 143 |
+
self.thread_id = "dd-poc-session"
|
| 144 |
+
else:
|
| 145 |
+
self.app = None
|
| 146 |
+
self.llm = None
|
| 147 |
+
|
| 148 |
+
def is_available(self) -> bool:
|
| 149 |
+
"""Check if the agent is available for use"""
|
| 150 |
+
return self.app is not None and self.llm is not None
|
| 151 |
+
|
| 152 |
+
def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
|
| 153 |
+
"""
|
| 154 |
+
Parse checklist using the agent
|
| 155 |
+
|
| 156 |
+
Args:
|
| 157 |
+
checklist_text: Raw checklist text to parse
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
Parsed checklist dictionary or None if failed
|
| 161 |
+
"""
|
| 162 |
+
if not self.app:
|
| 163 |
+
return None
|
| 164 |
+
|
| 165 |
+
try:
|
| 166 |
+
# Run the agent
|
| 167 |
+
result = self.app.invoke(
|
| 168 |
+
{"messages": [HumanMessage(content=f"Parse this checklist: {checklist_text}")]},
|
| 169 |
+
config={"configurable": {"thread_id": self.thread_id}}
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
return result.get("checklist")
|
| 173 |
+
except Exception as e:
|
| 174 |
+
st.error(f"Agent error: {str(e)}")
|
| 175 |
+
return None
|
| 176 |
+
|
| 177 |
+
def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
|
| 178 |
+
"""
|
| 179 |
+
Match documents to checklist items
|
| 180 |
+
|
| 181 |
+
Args:
|
| 182 |
+
checklist: Parsed checklist dictionary
|
| 183 |
+
documents: List of document dictionaries
|
| 184 |
+
|
| 185 |
+
Returns:
|
| 186 |
+
Dictionary of findings or empty dict if failed
|
| 187 |
+
"""
|
| 188 |
+
if not self.app:
|
| 189 |
+
return {}
|
| 190 |
+
|
| 191 |
+
try:
|
| 192 |
+
# Prepare state
|
| 193 |
+
initial_state = {
|
| 194 |
+
"messages": [HumanMessage(content="Match documents to checklist items")],
|
| 195 |
+
"checklist": checklist,
|
| 196 |
+
"documents": documents,
|
| 197 |
+
"findings": {}
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
result = self.app.invoke(
|
| 201 |
+
initial_state,
|
| 202 |
+
config={"configurable": {"thread_id": self.thread_id}}
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
return result.get("findings", {})
|
| 206 |
+
except Exception as e:
|
| 207 |
+
st.error(f"Agent error: {str(e)}")
|
| 208 |
+
return {}
|
| 209 |
+
|
| 210 |
+
def answer_question(self, question: str, documents: List[Dict]) -> str:
|
| 211 |
+
"""
|
| 212 |
+
Answer a question using document context
|
| 213 |
+
|
| 214 |
+
Args:
|
| 215 |
+
question: User question
|
| 216 |
+
documents: List of document dictionaries for context
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
Answer string or error message
|
| 220 |
+
"""
|
| 221 |
+
if not self.app:
|
| 222 |
+
return "Agent not available"
|
| 223 |
+
|
| 224 |
+
try:
|
| 225 |
+
initial_state = {
|
| 226 |
+
"messages": [HumanMessage(content=question)],
|
| 227 |
+
"documents": documents
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
result = self.app.invoke(
|
| 231 |
+
initial_state,
|
| 232 |
+
config={"configurable": {"thread_id": self.thread_id}}
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
+
# Get the last AI message
|
| 236 |
+
messages = result.get("messages", [])
|
| 237 |
+
for msg in reversed(messages):
|
| 238 |
+
if isinstance(msg, AIMessage):
|
| 239 |
+
return msg.content
|
| 240 |
+
|
| 241 |
+
return "No answer generated"
|
| 242 |
+
except Exception as e:
|
| 243 |
+
return f"Error: {str(e)}"
|
| 244 |
+
|
| 245 |
+
def summarize_findings(self, findings: Dict) -> str:
|
| 246 |
+
"""
|
| 247 |
+
Generate executive summary
|
| 248 |
+
|
| 249 |
+
Args:
|
| 250 |
+
findings: Dictionary of due diligence findings
|
| 251 |
+
|
| 252 |
+
Returns:
|
| 253 |
+
Summary string or error message
|
| 254 |
+
"""
|
| 255 |
+
if not self.app:
|
| 256 |
+
return "Agent not available"
|
| 257 |
+
|
| 258 |
+
try:
|
| 259 |
+
initial_state = {
|
| 260 |
+
"messages": [HumanMessage(content="Summarize the due diligence findings")],
|
| 261 |
+
"findings": findings
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
result = self.app.invoke(
|
| 265 |
+
initial_state,
|
| 266 |
+
config={"configurable": {"thread_id": self.thread_id}}
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
# Get the last AI message
|
| 270 |
+
messages = result.get("messages", [])
|
| 271 |
+
for msg in reversed(messages):
|
| 272 |
+
if isinstance(msg, AIMessage):
|
| 273 |
+
return msg.content
|
| 274 |
+
|
| 275 |
+
return "No summary generated"
|
| 276 |
+
except Exception as e:
|
| 277 |
+
return f"Error: {str(e)}"
|
app/ai/agent_utils.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Agent Utilities Module
|
| 4 |
+
|
| 5 |
+
This module contains utility functions, helper methods, and type definitions
|
| 6 |
+
for the LangGraph agent system.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
# Standard library imports
|
| 10 |
+
import logging
|
| 11 |
+
import random
|
| 12 |
+
import time
|
| 13 |
+
from enum import Enum
|
| 14 |
+
from typing import Optional, Dict, List, Sequence
|
| 15 |
+
|
| 16 |
+
# Third-party imports
|
| 17 |
+
from langchain_core.runnables import RunnableLambda
|
| 18 |
+
from typing_extensions import TypedDict
|
| 19 |
+
|
| 20 |
+
# Local imports
|
| 21 |
+
from app.core.config import get_config
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def with_retry(func, max_attempts=3, base_delay=1.0):
|
| 27 |
+
"""
|
| 28 |
+
Wrapper function to add exponential backoff retry logic to any function.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
func: Function to wrap with retry logic
|
| 32 |
+
max_attempts: Maximum number of retry attempts (default: 3)
|
| 33 |
+
base_delay: Base delay in seconds for exponential backoff (default: 1.0)
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
Wrapped function with retry logic
|
| 37 |
+
"""
|
| 38 |
+
def wrapper(*args, **kwargs):
|
| 39 |
+
for attempt in range(max_attempts):
|
| 40 |
+
try:
|
| 41 |
+
return func(*args, **kwargs)
|
| 42 |
+
except Exception as e:
|
| 43 |
+
if attempt == max_attempts - 1: # Last attempt
|
| 44 |
+
raise e
|
| 45 |
+
|
| 46 |
+
# Exponential backoff with jitter
|
| 47 |
+
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
|
| 48 |
+
logger.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {delay:.2f}s...")
|
| 49 |
+
time.sleep(delay)
|
| 50 |
+
|
| 51 |
+
return wrapper
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def create_batch_processor(llm: "ChatAnthropic", max_concurrency: int = None) -> RunnableLambda:
|
| 55 |
+
"""
|
| 56 |
+
Create a batch processor using LangChain's retry and fallback mechanisms.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
llm: ChatAnthropic instance
|
| 60 |
+
max_concurrency: Maximum concurrent requests (uses config default if None)
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
RunnableLambda configured with retry and fallback mechanisms
|
| 64 |
+
"""
|
| 65 |
+
config = get_config()
|
| 66 |
+
if max_concurrency is None:
|
| 67 |
+
max_concurrency = 3 # Default max concurrency
|
| 68 |
+
|
| 69 |
+
def process_single_item(input_data):
|
| 70 |
+
"""Process a single item with error handling"""
|
| 71 |
+
try:
|
| 72 |
+
messages, item_info = input_data
|
| 73 |
+
response = llm.invoke(messages)
|
| 74 |
+
return {
|
| 75 |
+
'success': True,
|
| 76 |
+
'response': response,
|
| 77 |
+
'item_info': item_info,
|
| 78 |
+
'error': None
|
| 79 |
+
}
|
| 80 |
+
except Exception as e:
|
| 81 |
+
# Fail immediately on any error
|
| 82 |
+
error_msg = f"Single item processing failed: {str(e)}"
|
| 83 |
+
logger.error(error_msg)
|
| 84 |
+
raise Exception(error_msg)
|
| 85 |
+
|
| 86 |
+
def process_batch(batch_inputs):
|
| 87 |
+
"""Process a batch of inputs with individual item error handling"""
|
| 88 |
+
try:
|
| 89 |
+
# Use LLM's batch method for efficiency
|
| 90 |
+
messages_batch = [input_data[0] for input_data in batch_inputs]
|
| 91 |
+
item_infos = [input_data[1] for input_data in batch_inputs]
|
| 92 |
+
|
| 93 |
+
responses = llm.batch(
|
| 94 |
+
messages_batch,
|
| 95 |
+
config={"max_concurrency": max_concurrency}
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Process results with individual error handling - fail on any error
|
| 99 |
+
results = []
|
| 100 |
+
for i, (response, item_info) in enumerate(zip(responses, item_infos)):
|
| 101 |
+
if response:
|
| 102 |
+
results.append({
|
| 103 |
+
'success': True,
|
| 104 |
+
'response': response,
|
| 105 |
+
'item_info': item_info,
|
| 106 |
+
'error': None
|
| 107 |
+
})
|
| 108 |
+
else:
|
| 109 |
+
# Fail immediately on any missing response
|
| 110 |
+
error_msg = f'No response for item {i}'
|
| 111 |
+
logger.error(error_msg)
|
| 112 |
+
raise Exception(error_msg)
|
| 113 |
+
|
| 114 |
+
return results
|
| 115 |
+
|
| 116 |
+
except Exception as e:
|
| 117 |
+
# If batch fails completely, fail immediately
|
| 118 |
+
error_msg = f"Batch processing failed: {e}"
|
| 119 |
+
logger.error(error_msg)
|
| 120 |
+
raise Exception(error_msg)
|
| 121 |
+
|
| 122 |
+
# Create the main processor with retry logic
|
| 123 |
+
retryable_process_batch = with_retry(process_batch, max_attempts=3, base_delay=1.0)
|
| 124 |
+
processor = RunnableLambda(retryable_process_batch)
|
| 125 |
+
|
| 126 |
+
return processor
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# =============================================================================
|
| 130 |
+
# TYPE DEFINITIONS
|
| 131 |
+
# =============================================================================
|
| 132 |
+
|
| 133 |
+
# Define the state for our agent
|
| 134 |
+
class AgentState(TypedDict):
|
| 135 |
+
"""State for the due diligence agent"""
|
| 136 |
+
messages: Sequence["BaseMessage"]
|
| 137 |
+
checklist: Optional[Dict]
|
| 138 |
+
documents: Optional[List[Dict]]
|
| 139 |
+
current_task: Optional[str]
|
| 140 |
+
findings: Dict[str, List[str]]
|
| 141 |
+
next_action: Optional[str]
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class TaskType(Enum):
|
| 145 |
+
"""Types of tasks the agent can perform"""
|
| 146 |
+
PARSE_CHECKLIST = "parse_checklist"
|
| 147 |
+
ANALYZE_DOCUMENT = "analyze_document"
|
| 148 |
+
MATCH_CHECKLIST = "match_checklist"
|
| 149 |
+
ANSWER_QUESTION = "answer_question"
|
| 150 |
+
SUMMARIZE_FINDINGS = "summarize_findings"
|
app/ai/document_classifier.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Document Classification Module
|
| 4 |
+
|
| 5 |
+
This module contains functions for classifying document types and related utilities.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
+
import logging
|
| 10 |
+
from typing import List, Dict, Optional
|
| 11 |
+
|
| 12 |
+
# Third-party imports
|
| 13 |
+
from langchain_core.messages import HumanMessage
|
| 14 |
+
import httpx
|
| 15 |
+
import backoff
|
| 16 |
+
|
| 17 |
+
# Local imports
|
| 18 |
+
from app.ai.agent_utils import create_batch_processor
|
| 19 |
+
from app.ai.prompts import get_document_type_classification_prompt
|
| 20 |
+
from app.core.config import get_config
|
| 21 |
+
from app.core.constants import DEFAULT_BATCH_SIZE
|
| 22 |
+
from app.core.performance import get_performance_manager
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@backoff.on_exception(
|
| 28 |
+
backoff.expo,
|
| 29 |
+
(Exception,),
|
| 30 |
+
max_tries=3,
|
| 31 |
+
jitter=backoff.random_jitter
|
| 32 |
+
)
|
| 33 |
+
def batch_classify_document_types(first_chunks: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
|
| 34 |
+
"""
|
| 35 |
+
Fast document type classification using first chunks only with Haiku model.
|
| 36 |
+
Optimized for speed and cost with batched processing.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
first_chunks: List of first chunk dictionaries to classify
|
| 40 |
+
llm: ChatAnthropic instance (should be Haiku for speed/cost)
|
| 41 |
+
batch_size: Number of documents to process in each batch (uses config default if None)
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
List of documents with added document_type field
|
| 45 |
+
"""
|
| 46 |
+
config = get_config()
|
| 47 |
+
if batch_size is None:
|
| 48 |
+
# Use optimized batch size for Haiku (faster model)
|
| 49 |
+
batch_size = min(DEFAULT_BATCH_SIZE, 25) # Increased to 25 docs per batch for better performance
|
| 50 |
+
|
| 51 |
+
# Create batch processor with retry and fallback mechanisms
|
| 52 |
+
batch_processor = create_batch_processor(llm, max_concurrency=5) # Increased concurrency
|
| 53 |
+
|
| 54 |
+
# Process documents in batches
|
| 55 |
+
classified_docs = []
|
| 56 |
+
total_docs = len(first_chunks)
|
| 57 |
+
total_batches = (total_docs + batch_size - 1) // batch_size
|
| 58 |
+
|
| 59 |
+
model_name = getattr(llm, 'model', 'unknown')
|
| 60 |
+
logger.info(f"🏷️ Classifying {total_docs} document types using {model_name}")
|
| 61 |
+
|
| 62 |
+
# Get performance manager for caching
|
| 63 |
+
perf_manager = get_performance_manager()
|
| 64 |
+
|
| 65 |
+
for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
|
| 66 |
+
batch = first_chunks[i:i + batch_size]
|
| 67 |
+
batch_end = min(i + batch_size, total_docs)
|
| 68 |
+
|
| 69 |
+
# Check cache for existing classifications
|
| 70 |
+
cached_batch = []
|
| 71 |
+
uncached_batch = []
|
| 72 |
+
uncached_indices = []
|
| 73 |
+
|
| 74 |
+
for idx, doc in enumerate(batch):
|
| 75 |
+
cache_key = f"classification:{doc.get('path', '')}"
|
| 76 |
+
cached_result = perf_manager.doc_cache.get(cache_key)
|
| 77 |
+
if cached_result:
|
| 78 |
+
cached_batch.append(cached_result)
|
| 79 |
+
logger.debug(f"Cache hit for document classification: {doc.get('name', '')}")
|
| 80 |
+
else:
|
| 81 |
+
uncached_batch.append(doc)
|
| 82 |
+
uncached_indices.append(idx)
|
| 83 |
+
|
| 84 |
+
logger.info(f"Processing classification batch {batch_num}/{total_batches} "
|
| 85 |
+
f"({len(uncached_batch)} new, {len(cached_batch)} cached documents)")
|
| 86 |
+
|
| 87 |
+
# Only process uncached documents
|
| 88 |
+
if uncached_batch:
|
| 89 |
+
batch_inputs = []
|
| 90 |
+
for doc in uncached_batch:
|
| 91 |
+
template = get_document_type_classification_prompt()
|
| 92 |
+
prompt = template.format(
|
| 93 |
+
doc_name=doc.get('name', 'Unknown'),
|
| 94 |
+
content_preview=doc.get('content', '')[:500] # First 500 chars for classification
|
| 95 |
+
)
|
| 96 |
+
messages = [HumanMessage(content=prompt)]
|
| 97 |
+
batch_inputs.append((messages, doc))
|
| 98 |
+
|
| 99 |
+
# Process batch using LangChain's built-in mechanisms
|
| 100 |
+
try:
|
| 101 |
+
logger.info(f"Processing classification batch {batch_num}/{total_batches} with {len(uncached_batch)} new documents")
|
| 102 |
+
batch_results = batch_processor.invoke(batch_inputs)
|
| 103 |
+
|
| 104 |
+
# Process results with individual document error handling
|
| 105 |
+
for idx, result in enumerate(batch_results):
|
| 106 |
+
doc = result['item_info'].copy()
|
| 107 |
+
|
| 108 |
+
if result['success'] and result['response']:
|
| 109 |
+
# Successfully classified document type
|
| 110 |
+
doc_type = result['response'].content.strip().lower()
|
| 111 |
+
# Remove any "the document type is" prefix if present (for backward compatibility)
|
| 112 |
+
if doc_type.startswith("the document type is "):
|
| 113 |
+
doc_type = doc_type[21:].strip()
|
| 114 |
+
doc['document_type'] = doc_type
|
| 115 |
+
logger.debug(f"Classified '{doc.get('name', 'Unknown')}' as: {doc_type}")
|
| 116 |
+
|
| 117 |
+
# Cache the result
|
| 118 |
+
cache_key = f"classification:{doc.get('path', '')}"
|
| 119 |
+
perf_manager.doc_cache.set(cache_key, doc, expire=86400 * 30) # 30 days
|
| 120 |
+
|
| 121 |
+
classified_docs.append(doc)
|
| 122 |
+
else:
|
| 123 |
+
# Fail on classification error
|
| 124 |
+
error_msg = f"Failed to classify document '{doc.get('name', 'Unknown')}': {result.get('error', 'Unknown error')}"
|
| 125 |
+
logger.error(error_msg)
|
| 126 |
+
raise Exception(error_msg)
|
| 127 |
+
|
| 128 |
+
except Exception as e:
|
| 129 |
+
error_msg = f"Classification batch {batch_num} processing completely failed: {e}"
|
| 130 |
+
logger.error(error_msg)
|
| 131 |
+
raise Exception(error_msg)
|
| 132 |
+
|
| 133 |
+
# Add cached results to the final list
|
| 134 |
+
classified_docs.extend(cached_batch)
|
| 135 |
+
|
| 136 |
+
successful_classifications = len([d for d in classified_docs if d.get('document_type') != 'unknown document'])
|
| 137 |
+
success_rate = (successful_classifications / total_docs) * 100 if total_docs > 0 else 0
|
| 138 |
+
logger.info(f"✅ Classified {successful_classifications}/{total_docs} documents ({success_rate:.1f}% success rate)")
|
| 139 |
+
|
| 140 |
+
return classified_docs
|
app/ai/processing_pipeline.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Processing Pipeline Module
|
| 4 |
+
|
| 5 |
+
This module contains content processing pipeline and workflow functions,
|
| 6 |
+
including agent node functions and batch processing utilities.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
# Standard library imports
|
| 10 |
+
import logging
|
| 11 |
+
from typing import List, Dict, Optional
|
| 12 |
+
|
| 13 |
+
# Third-party imports
|
| 14 |
+
import streamlit as st
|
| 15 |
+
from langchain_core.messages import HumanMessage, AIMessage
|
| 16 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 17 |
+
from pydantic import BaseModel, Field
|
| 18 |
+
|
| 19 |
+
# Local imports
|
| 20 |
+
from app.ai.agent_utils import AgentState, create_batch_processor
|
| 21 |
+
from app.ai.prompts import (
|
| 22 |
+
get_checklist_parsing_prompt,
|
| 23 |
+
get_document_relevance_prompt,
|
| 24 |
+
get_question_answering_prompt,
|
| 25 |
+
get_findings_summary_prompt,
|
| 26 |
+
get_description_generation_prompt,
|
| 27 |
+
get_document_summarization_prompt
|
| 28 |
+
)
|
| 29 |
+
from app.core.config import get_config
|
| 30 |
+
from app.core.constants import DEFAULT_BATCH_SIZE
|
| 31 |
+
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Pydantic models for structured output parsing
|
| 36 |
+
class ChecklistItem(BaseModel):
    """A single line item from a due diligence checklist."""
    # "text" is the cleaned item; "original" preserves the raw source line
    # (including numbering) so it can be traced back to the input document.
    text: str = Field(description="The checklist item text")
    original: str = Field(description="The original text before any cleanup")
|
| 40 |
+
|
| 41 |
+
class ChecklistCategory(BaseModel):
    """A named checklist category holding its list of items."""
    name: str = Field(description="Category name (e.g., 'Organizational and Corporate Documents')")
    items: List[ChecklistItem] = Field(description="List of checklist items in this category")
|
| 45 |
+
|
| 46 |
+
class StructuredChecklist(BaseModel):
    """Complete parsed checklist: all categories keyed by their letter (A, B, C, ...)."""
    categories: Dict[str, ChecklistCategory] = Field(
        description="Dictionary of categories keyed by letter (A, B, C, etc.)"
    )
|
| 51 |
+
|
| 52 |
+
class Question(BaseModel):
    """A single due diligence question with its category and stable identifier."""
    category: str = Field(description="Question category")
    question: str = Field(description="The question text")
    id: str = Field(description="Unique question ID")
|
| 57 |
+
|
| 58 |
+
class StructuredQuestions(BaseModel):
    """Container model for a parsed list of questions."""
    questions: List[Question] = Field(description="List of all questions")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def route_task(state: AgentState) -> AgentState:
    """Inspect the latest message and record the next pipeline action.

    Mutates ``state["next_action"]`` in place and returns the same state.
    When the message history is empty the state is returned untouched.
    """
    history = state["messages"]
    if not history:
        return state

    text = history[-1].content.lower()

    # Keyword heuristics, checked in priority order.
    if "parse" in text and "checklist" in text:
        chosen = "parse_checklist"
    elif "analyze" in text or "match" in text:
        chosen = "match_checklist"
    elif "?" in text:
        chosen = "answer_question"
    else:
        chosen = "summarize"

    state["next_action"] = chosen
    return state
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def parse_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Parse a raw checklist into structured categories via the LLM.

    Reads the checklist text from the last message, asks the model for JSON
    matching ``StructuredChecklist`` (same parser/prompt pair as the
    standalone ``parse_checklist`` function), and stores the result as plain
    dicts under ``state["checklist"]``. Failures are reported as an
    ``AIMessage`` instead of raising.
    """
    history = state["messages"]
    raw_text = history[-1].content if history else ""

    output_parser = PydanticOutputParser(pydantic_object=StructuredChecklist)
    template = get_checklist_parsing_prompt()

    try:
        # Send the full checklist text on purpose - the LLM handles truncation.
        request = template.format_messages(
            checklist_text=raw_text,
            format_instructions=output_parser.get_format_instructions()
        )
        reply = llm.invoke(request)
        parsed = output_parser.parse(reply.content)

        # Flatten the Pydantic models into the dict shape the rest of the app expects.
        as_dicts = {
            key: {
                'name': cat.name,
                'items': [
                    {'text': it.text, 'original': it.original}
                    for it in cat.items
                ],
            }
            for key, cat in parsed.categories.items()
        }

        state["checklist"] = as_dicts
        state["messages"].append(AIMessage(content=f"Parsed {len(as_dicts)} categories"))

    except Exception as e:
        state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))

    return state
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def match_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Match documents to checklist items using LLM relevance assessments.

    For every checklist item, asks the LLM which of (at most) the first ten
    documents look relevant, and stores the per-category results under
    ``state["findings"]``. Requires both ``checklist`` and ``documents`` in
    the state; otherwise appends an explanatory message and returns.
    """
    checklist = state.get("checklist", {})
    documents = state.get("documents", [])

    if not checklist or not documents:
        state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
        return state

    # The document-name list is identical for every item; build it once.
    document_names = [d.get('name', 'Unknown') for d in documents[:10]]

    findings = {}
    for cat_letter, category in checklist.items():
        cat_findings = []
        for item in category.get("items", []):
            template = get_document_relevance_prompt(item['text'], document_names)
            # BUG FIX: previously str(template) was sent to the LLM, which is
            # the PromptTemplate's repr with unfilled {item_text}/{documents}
            # placeholders. Render the template with its variables instead.
            prompt_text = template.format(
                item_text=item['text'],
                documents="\n".join(document_names)
            )

            response = llm.invoke([HumanMessage(content=prompt_text)])
            cat_findings.append({
                "item": item['text'],
                "relevant_docs": response.content
            })

        findings[category['name']] = cat_findings

    state["findings"] = findings
    state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))

    return state
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def answer_question_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Answer the latest question using snippets from up to five documents.

    Builds a compact context (name plus first 200 characters of each doc),
    queries the LLM, and appends the answer to the message history.
    """
    history = state["messages"]
    question = history[-1].content if history else ""
    docs = state.get("documents", [])

    # One bullet per document: name plus a short text preview.
    context = "\n".join(
        f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
        for d in docs[:5]
    )

    qa_prompt = get_question_answering_prompt(question, context)
    reply = llm.invoke([HumanMessage(content=qa_prompt)])
    state["messages"].append(AIMessage(content=reply.content))

    return state
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def summarize_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Ask the LLM to summarize accumulated findings.

    Appends the summary (or a "nothing to do" notice when there are no
    findings) to the message history and returns the mutated state.
    """
    findings = state.get("findings", {})

    if not findings:
        state["messages"].append(AIMessage(content="No findings to summarize"))
        return state

    # NOTE(review): get_findings_summary_prompt's visible signature returns a
    # PromptTemplate; confirm it yields a string suitable for message content.
    summary_prompt = get_findings_summary_prompt(findings)
    reply = llm.invoke([HumanMessage(content=summary_prompt)])
    state["messages"].append(AIMessage(content=reply.content))

    return state
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def route_condition(state: AgentState) -> str:
    """Map ``state["next_action"]`` to a graph edge name.

    Any unrecognised or missing action falls through to "summarize".
    """
    action = state.get("next_action")
    if action in ("parse_checklist", "match_checklist", "answer_question"):
        return action
    return "summarize"
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def batch_summarize_documents(documents: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
    """
    Summarize documents using LangChain's built-in retry mechanisms and proper error handling.
    Uses RunnableLambda for better batch processing control with individual item error handling.
    Returns documents with added 'summary' field.

    Args:
        documents: List of document dictionaries to summarize
        llm: ChatAnthropic instance for generating summaries
        batch_size: Number of documents to process in each batch (uses config default if None)

    Returns:
        List of documents with added summary field

    Raises:
        Exception: if any single summary fails or a whole batch fails;
            processing is all-or-nothing by design.
    """

    # NOTE(review): config is loaded but not used below - batch_size falls
    # back to DEFAULT_BATCH_SIZE directly. Confirm whether a configured batch
    # size was intended here.
    config = get_config()
    if batch_size is None:
        batch_size = DEFAULT_BATCH_SIZE

    # Create batch processor with retry and fallback mechanisms
    batch_processor = create_batch_processor(llm, max_concurrency=3)

    # Process documents in batches
    summarized_docs = []
    total_docs = len(documents)
    # Ceiling division: number of batches needed to cover all documents.
    total_batches = (total_docs + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
        batch = documents[i:i + batch_size]
        batch_end = min(i + batch_size, total_docs)

        # Update the Streamlit progress bar only when one was registered in
        # session state by the caller; safe no-op otherwise.
        if hasattr(st, 'progress') and 'summary_progress' in st.session_state:
            progress = i / total_docs
            st.session_state.summary_progress.progress(
                progress,
                text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
            )

        # Prepare batch inputs for the processor: (messages, original doc) pairs
        # so each result can be traced back to its source document.
        batch_inputs = []
        for doc in batch:
            template = get_document_summarization_prompt(doc)
            prompt = template.format()
            messages = [HumanMessage(content=prompt)]
            batch_inputs.append((messages, doc))

        # Process batch using LangChain's built-in mechanisms
        try:
            batch_results = batch_processor.invoke(batch_inputs)

            # Process results with individual document error handling
            for result in batch_results:
                # Copy so the caller's input dicts are not mutated.
                doc = result['item_info'].copy()

                if result['success'] and result['response']:
                    # Successfully generated summary
                    doc['summary'] = result['response'].content.strip()
                    summarized_docs.append(doc)
                else:
                    # Fail on summary generation error (fail-fast by design)
                    error_msg = f"Failed to generate summary for document '{doc.get('name', 'Unknown')}': {result.get('error', 'Unknown error')}"
                    logger.error(error_msg)
                    raise Exception(error_msg)

        except Exception as e:
            # Re-wrap with batch context; per-document failures above also land here.
            error_msg = f"Batch {batch_num} processing completely failed: {e}"
            logger.error(error_msg)
            raise Exception(error_msg)

    return summarized_docs
|
{src → app}/ai/prompts.py
RENAMED
|
@@ -6,46 +6,75 @@ This module contains all prompt templates used for AI interactions
|
|
| 6 |
in the DD-Checklist application.
|
| 7 |
"""
|
| 8 |
|
|
|
|
| 9 |
import json
|
| 10 |
from typing import Dict, List
|
| 11 |
-
|
|
|
|
| 12 |
from langchain_core.messages import SystemMessage, HumanMessage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
-
def get_checklist_parsing_prompt(
|
| 16 |
-
"""Generate prompt for parsing due diligence checklists with structured output"""
|
| 17 |
return ChatPromptTemplate.from_messages([
|
| 18 |
SystemMessage(content="""
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""),
|
| 26 |
-
|
| 27 |
|
| 28 |
{checklist_text}
|
| 29 |
|
|
|
|
| 30 |
{format_instructions}
|
| 31 |
|
| 32 |
-
|
| 33 |
])
|
| 34 |
|
| 35 |
|
| 36 |
def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
|
| 37 |
-
"""Generate prompt for assessing document relevance to checklist items
|
| 38 |
return PromptTemplate.from_template(
|
| 39 |
-
"""Analyze which documents are relevant to the following checklist item
|
| 40 |
|
| 41 |
Checklist Item: {item_text}
|
| 42 |
|
| 43 |
Available Documents:
|
| 44 |
{documents}
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
Please provide your analysis in the specified format:"""
|
| 49 |
)
|
| 50 |
|
| 51 |
|
|
@@ -57,7 +86,7 @@ def get_question_answering_prompt(question: str, context: str) -> ChatPromptTemp
|
|
| 57 |
])
|
| 58 |
|
| 59 |
|
| 60 |
-
def get_findings_summary_prompt(findings: Dict, max_chars: int =
|
| 61 |
"""Generate prompt for summarizing due diligence findings"""
|
| 62 |
findings_text = json.dumps(findings, indent=2)[:max_chars]
|
| 63 |
return PromptTemplate.from_template(
|
|
@@ -81,6 +110,24 @@ def get_description_generation_prompt(category_name: str, item_text: str) -> Pro
|
|
| 81 |
).partial(category_name=category_name, item_text=item_text)
|
| 82 |
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
|
| 85 |
"""Generate prompt for document type identification and summarization"""
|
| 86 |
doc_name = doc.get('name', 'Unknown')
|
|
|
|
| 6 |
in the DD-Checklist application.
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
# Standard library imports
|
| 10 |
import json
|
| 11 |
from typing import Dict, List
|
| 12 |
+
|
| 13 |
+
# Third-party imports
|
| 14 |
from langchain_core.messages import SystemMessage, HumanMessage
|
| 15 |
+
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
| 16 |
+
|
| 17 |
+
# Local imports
|
| 18 |
+
from app.core.constants import QA_MAX_TOKENS
|
| 19 |
|
| 20 |
|
| 21 |
+
def get_checklist_parsing_prompt() -> ChatPromptTemplate:
    """Generate prompt template for parsing due diligence checklists with structured output.

    The returned template expects two variables when formatted:
    ``checklist_text`` (the raw checklist) and ``format_instructions``
    (typically from a PydanticOutputParser for StructuredChecklist).
    """
    # SystemMessage is a literal message (not a template), so the braces in
    # the embedded JSON example are safe from template interpolation.
    return ChatPromptTemplate.from_messages([
        SystemMessage(content="""
You are a JSON parser. Your ONLY task is to convert the checklist into valid JSON format.

CRITICAL PARSING RULES:
- Return ONLY valid JSON - no explanations, no notes, no additional text
- Do NOT add any conversational text before or after the JSON
- Do NOT offer to continue or ask questions
- Do NOT provide partial results or examples
- Parse the COMPLETE document - every single category and item

JSON Structure Required:
- Top-level object with "categories" field
- Categories keyed by letter (A, B, C, D, E, etc.)
- Each category has "name" and "items" fields
- Each item has "text" and "original" fields

You must process the ENTIRE checklist. Do not stop after a few categories.

Output format:
{
  "categories": {
    "A": {
      "name": "Category Name",
      "items": [
        {"text": "Item text", "original": "1. Item text"}
      ]
    }
  }
}

Return ONLY the JSON. No other text.
"""),
        HumanMessagePromptTemplate.from_template("""Parse this complete checklist into the exact JSON format:

{checklist_text}

Required JSON schema:
{format_instructions}

Return the complete JSON with all categories found in the checklist:""")
    ])
|
| 65 |
|
| 66 |
|
| 67 |
def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
    """Generate prompt for assessing document relevance to checklist items.

    NOTE(review): the ``item_text`` and ``documents`` parameters are accepted
    but NOT bound into the returned template (compare
    get_description_generation_prompt, which uses ``.partial``). Callers must
    supply both variables via ``template.format(item_text=..., documents=...)``
    -- confirm whether binding here was intended.
    """
    return PromptTemplate.from_template(
        """Analyze which documents are most relevant to the following checklist item.

Checklist Item: {item_text}

Available Documents:
{documents}

Provide a brief analysis identifying the most relevant documents and explain why they are relevant to this checklist item. Be concise and specific."""
    )
|
| 79 |
|
| 80 |
|
|
|
|
| 86 |
])
|
| 87 |
|
| 88 |
|
| 89 |
+
def get_findings_summary_prompt(findings: Dict, max_chars: int = QA_MAX_TOKENS) -> PromptTemplate:
|
| 90 |
"""Generate prompt for summarizing due diligence findings"""
|
| 91 |
findings_text = json.dumps(findings, indent=2)[:max_chars]
|
| 92 |
return PromptTemplate.from_template(
|
|
|
|
| 110 |
).partial(category_name=category_name, item_text=item_text)
|
| 111 |
|
| 112 |
|
| 113 |
+
def get_document_type_classification_prompt() -> PromptTemplate:
    """Generate prompt for fast document type classification based on first chunk content.

    Template variables: ``doc_name`` (file name) and ``content_preview``
    (leading excerpt of the document). Designed for a short, single-phrase
    completion so a fast model (e.g. Haiku) can classify cheaply.
    """
    return PromptTemplate.from_template(
        "Classify the document type using one short phrase. Use exact terminology.\n"
        "Respond with ONLY the document type, no prefix or explanation.\n\n"
        "Examples:\n"
        "certificate of incorporation\n"
        "corporate bylaws\n"
        "amended and restated bylaws\n"
        "board resolution\n"
        "financial statement\n"
        "employment agreement\n"
        "software license agreement\n\n"
        "Document: {doc_name}\n"
        "Content: {content_preview}\n\n"
        "Document type:"
    )
|
| 130 |
+
|
| 131 |
def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
|
| 132 |
"""Generate prompt for document type identification and summarization"""
|
| 133 |
doc_name = doc.get('name', 'Unknown')
|
app/core/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core Business Logic Layer
|
| 3 |
+
|
| 4 |
+
This layer contains the core business logic and domain models.
|
| 5 |
+
It should not depend on UI or external frameworks.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Configuration
|
| 9 |
+
from .config import AppConfig, get_config
|
| 10 |
+
|
| 11 |
+
# Exceptions
|
| 12 |
+
from .exceptions import (
|
| 13 |
+
AppException,
|
| 14 |
+
DocumentProcessingError,
|
| 15 |
+
SearchError,
|
| 16 |
+
ConfigError,
|
| 17 |
+
FileOperationError,
|
| 18 |
+
AIError,
|
| 19 |
+
LLMConnectionError,
|
| 20 |
+
LLMAuthenticationError,
|
| 21 |
+
LLMTimeoutError,
|
| 22 |
+
LLMQuotaExceededError,
|
| 23 |
+
LLMInvalidResponseError,
|
| 24 |
+
create_processing_error,
|
| 25 |
+
create_config_error,
|
| 26 |
+
create_ai_error
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# Core classes and functions
|
| 30 |
+
from .document_processor import DocumentProcessor
|
| 31 |
+
from .search import search_and_analyze, search_documents
|
| 32 |
+
from .ranking import rerank_results
|
| 33 |
+
from .parsers import parse_checklist, parse_questions
|
| 34 |
+
from .utils import create_document_processor, format_document_title, count_documents_in_directory
|
| 35 |
+
from .logging import logger
|
| 36 |
+
from .constants import (
|
| 37 |
+
RELEVANCY_THRESHOLD,
|
| 38 |
+
SIMILARITY_THRESHOLD,
|
| 39 |
+
DEFAULT_BATCH_SIZE,
|
| 40 |
+
QA_MAX_TOKENS,
|
| 41 |
+
CHECKLIST_PARSING_MAX_TOKENS
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
__all__ = [
|
| 45 |
+
# Configuration
|
| 46 |
+
'AppConfig', 'get_config',
|
| 47 |
+
|
| 48 |
+
# Exceptions
|
| 49 |
+
'AppException', 'DocumentProcessingError', 'SearchError', 'ConfigError',
|
| 50 |
+
'FileOperationError', 'AIError', 'LLMConnectionError', 'LLMAuthenticationError',
|
| 51 |
+
'LLMTimeoutError', 'LLMQuotaExceededError', 'LLMInvalidResponseError',
|
| 52 |
+
'create_processing_error', 'create_config_error', 'create_ai_error',
|
| 53 |
+
|
| 54 |
+
# Core functionality
|
| 55 |
+
'DocumentProcessor', 'search_and_analyze', 'search_documents', 'rerank_results',
|
| 56 |
+
'parse_checklist', 'parse_questions', 'create_document_processor',
|
| 57 |
+
'format_document_title', 'count_documents_in_directory', 'logger',
|
| 58 |
+
|
| 59 |
+
# Constants
|
| 60 |
+
'RELEVANCY_THRESHOLD', 'SIMILARITY_THRESHOLD', 'DEFAULT_BATCH_SIZE', 'QA_MAX_TOKENS', 'CHECKLIST_PARSING_MAX_TOKENS'
|
| 61 |
+
]
|
app/core/config.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any, Optional
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from app.core.constants import (
|
| 6 |
+
CHUNK_SIZE, CHUNK_OVERLAP, SIMILARITY_THRESHOLD,
|
| 7 |
+
RELEVANCY_THRESHOLD, CLASSIFICATION_MAX_TOKENS, CHECKLIST_PARSING_MAX_TOKENS,
|
| 8 |
+
TEMPERATURE
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AppConfig:
|
| 15 |
+
    def __init__(self) -> None:
        """Build the full configuration dictionary eagerly at construction time."""
        self._config: Dict[str, Any] = {}
        self._load_config()
|
| 18 |
+
|
| 19 |
+
    def _load_config(self) -> None:
        """Populate self._config with ui/model/processing/paths/anthropic sections.

        Values come from hard-coded defaults plus environment variables
        (loaded by dotenv at import time). Validation happens separately in
        ``validate()``; this method only assembles the dictionary.
        """
        # Streamlit page chrome and search display settings.
        self._config['ui'] = {
            'page_title': "🤖 AI Due Diligence",
            'page_icon': "🤖",
            'layout': "wide",
            'top_k_search_results': 10
        }

        # Model identifiers and generation parameters. CLAUDE_MODEL and
        # CLAUDE_TEMPERATURE/CLAUDE_MAX_TOKENS can be overridden via env vars.
        self._config['model'] = {
            'sentence_transformer_model': 'sentence-transformers/all-mpnet-base-v2',
            'claude_model': os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet'),
            'claude_haiku_model': 'claude-3-5-haiku-20241022',
            'classification_max_tokens': CLASSIFICATION_MAX_TOKENS,
            'temperature': float(os.getenv('CLAUDE_TEMPERATURE', str(TEMPERATURE))),
            'max_tokens': int(os.getenv('CLAUDE_MAX_TOKENS', '16000'))  # High limit for checklist parsing
        }

        # Chunking, thresholds, and accepted upload types.
        self._config['processing'] = {
            'chunk_size': CHUNK_SIZE,
            'chunk_overlap': CHUNK_OVERLAP,
            'similarity_threshold': SIMILARITY_THRESHOLD,
            'relevancy_threshold': RELEVANCY_THRESHOLD,
            'supported_file_extensions': [
                '.pdf', '.docx', '.doc', '.txt', '.md',
                '.xls', '.xlsx', '.ppt', '.pptx'
            ],
            'faiss_store_name': 'default'
        }

        # On-disk layout (relative to the working directory).
        self._config['paths'] = {
            'data_dir': Path('data'),
            'strategy_dir': Path('data/strategy'),
            'checklist_dir': Path('data/checklist'),
            'questions_dir': Path('data/questions'),
            'vdrs_dir': Path('data/vdrs'),
            'faiss_dir': Path('data/search_indexes')
        }

        # Anthropic API credentials; CLAUDE_MODEL is intentionally duplicated
        # here from the model section for API-client consumers.
        self._config['anthropic'] = {
            'api_key': os.getenv('ANTHROPIC_API_KEY'),
            'model': os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet')
        }
|
| 61 |
+
|
| 62 |
+
    @property
    def ui(self) -> Dict[str, Any]:
        """UI settings: page title/icon/layout and search result count."""
        return self._config['ui']
|
| 65 |
+
|
| 66 |
+
    @property
    def model(self) -> Dict[str, Any]:
        """Model identifiers and generation parameters (temperature, max_tokens)."""
        return self._config['model']
|
| 69 |
+
|
| 70 |
+
    @property
    def processing(self) -> Dict[str, Any]:
        """Document-processing settings: chunking, thresholds, file extensions."""
        return self._config['processing']
|
| 73 |
+
|
| 74 |
+
    @property
    def paths(self) -> Dict[str, Path]:
        """Filesystem layout for data, checklists, questions, VDRs, and indexes."""
        return self._config['paths']
|
| 77 |
+
|
| 78 |
+
    @property
    def anthropic(self) -> Dict[str, Optional[str]]:
        """Anthropic API credentials and model name (api_key may be None)."""
        return self._config['anthropic']
|
| 81 |
+
|
| 82 |
+
def validate(self) -> bool:
|
| 83 |
+
"""Validate all critical configuration values."""
|
| 84 |
+
self._validate_anthropic_config()
|
| 85 |
+
self._validate_paths()
|
| 86 |
+
self._validate_models()
|
| 87 |
+
self._validate_processing_config()
|
| 88 |
+
self._validate_file_extensions()
|
| 89 |
+
return True
|
| 90 |
+
|
| 91 |
+
    def _validate_anthropic_config(self) -> None:
        """Validate Anthropic API configuration.

        Raises:
            ValueError: if the API key or model is missing, or the model is
                not in the supported allow-list.
        """
        if not self.anthropic.get('api_key'):
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")

        model = self.anthropic.get('model')
        if not model:
            raise ValueError("CLAUDE_MODEL environment variable is required")

        # Allow-list of model names the app is known to work with.
        # NOTE(review): 'claude-3-5-sonnet' lacks a date suffix unlike the
        # others - confirm this matches what the API client expects.
        valid_claude_models = [
            'claude-3-5-sonnet',
            'claude-3-5-haiku-20241022',
            'claude-3-opus-20240229',
            'claude-3-sonnet-20240229',
            'claude-3-haiku-20240307'
        ]
        if model not in valid_claude_models:
            raise ValueError(f"Invalid Claude model: {model}. Valid models: {', '.join(valid_claude_models)}")
|
| 109 |
+
|
| 110 |
+
    def _validate_paths(self) -> None:
        """Validate that critical directories exist and are directories.

        Only data_dir and vdrs_dir are required; the other configured paths
        are created on demand elsewhere.

        Raises:
            ValueError: if a required path is missing or is not a directory.
        """
        critical_dirs = [
            ('data_dir', self.paths['data_dir']),
            ('vdrs_dir', self.paths['vdrs_dir'])
        ]

        for dir_name, dir_path in critical_dirs:
            if not dir_path.exists():
                raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
            if not dir_path.is_dir():
                raise ValueError(f"Path '{dir_name}' exists but is not a directory: {dir_path}")
|
| 122 |
+
|
| 123 |
+
def _validate_models(self) -> None:
|
| 124 |
+
"""Validate that required models are available."""
|
| 125 |
+
# Check sentence transformer model
|
| 126 |
+
model_path = Path('models') / 'sentence_transformers' / self.model['sentence_transformer_model'].split('/')[-1]
|
| 127 |
+
if not model_path.exists():
|
| 128 |
+
raise ValueError(f"Sentence transformer model not found: {model_path}")
|
| 129 |
+
|
| 130 |
+
# Check cross-encoder model
|
| 131 |
+
cross_encoder_path = Path('models') / 'cross_encoder' / 'ms-marco-MiniLM-L-6-v2'
|
| 132 |
+
if not cross_encoder_path.exists():
|
| 133 |
+
raise ValueError(f"Cross-encoder model not found: {cross_encoder_path}")
|
| 134 |
+
|
| 135 |
+
def _validate_processing_config(self) -> None:
|
| 136 |
+
"""Validate processing configuration values."""
|
| 137 |
+
processing = self.processing
|
| 138 |
+
|
| 139 |
+
# Validate chunk size
|
| 140 |
+
chunk_size = processing['chunk_size']
|
| 141 |
+
if not isinstance(chunk_size, int) or chunk_size <= 0:
|
| 142 |
+
raise ValueError(f"Invalid chunk_size: {chunk_size}. Must be a positive integer.")
|
| 143 |
+
|
| 144 |
+
# Validate chunk overlap
|
| 145 |
+
chunk_overlap = processing['chunk_overlap']
|
| 146 |
+
if not isinstance(chunk_overlap, int) or chunk_overlap < 0:
|
| 147 |
+
raise ValueError(f"Invalid chunk_overlap: {chunk_overlap}. Must be a non-negative integer.")
|
| 148 |
+
if chunk_overlap >= chunk_size:
|
| 149 |
+
raise ValueError(f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})")
|
| 150 |
+
|
| 151 |
+
# Validate thresholds
|
| 152 |
+
similarity_threshold = processing['similarity_threshold']
|
| 153 |
+
if not isinstance(similarity_threshold, (int, float)) or not (0 <= similarity_threshold <= 1):
|
| 154 |
+
raise ValueError(f"Invalid similarity_threshold: {similarity_threshold}. Must be between 0 and 1.")
|
| 155 |
+
|
| 156 |
+
relevancy_threshold = processing['relevancy_threshold']
|
| 157 |
+
if not isinstance(relevancy_threshold, (int, float)) or not (0 <= relevancy_threshold <= 1):
|
| 158 |
+
raise ValueError(f"Invalid relevancy_threshold: {relevancy_threshold}. Must be between 0 and 1.")
|
| 159 |
+
|
| 160 |
+
# Validate max tokens
|
| 161 |
+
max_tokens = processing.get('classification_max_tokens', CLASSIFICATION_MAX_TOKENS)
|
| 162 |
+
if not isinstance(max_tokens, int) or max_tokens <= 0:
|
| 163 |
+
raise ValueError(f"Invalid classification_max_tokens: {max_tokens}. Must be a positive integer.")
|
| 164 |
+
|
| 165 |
+
def _validate_file_extensions(self) -> None:
|
| 166 |
+
"""Validate supported file extensions."""
|
| 167 |
+
extensions = self.processing['supported_file_extensions']
|
| 168 |
+
if not extensions:
|
| 169 |
+
raise ValueError("supported_file_extensions cannot be empty")
|
| 170 |
+
|
| 171 |
+
# Validate each extension starts with a dot and contains valid characters
|
| 172 |
+
for ext in extensions:
|
| 173 |
+
if not isinstance(ext, str):
|
| 174 |
+
raise ValueError(f"Invalid file extension type: {type(ext)}. Must be string.")
|
| 175 |
+
if not ext.startswith('.'):
|
| 176 |
+
raise ValueError(f"File extension must start with '.': {ext}")
|
| 177 |
+
if len(ext) < 2 or not ext[1:].replace('_', '').replace('-', '').isalnum():
|
| 178 |
+
raise ValueError(f"Invalid file extension format: {ext}")
|
| 179 |
+
|
| 180 |
+
def get_supported_extensions(self) -> list[str]:
    """Get list of supported file extensions for document processing."""
    # NOTE(review): reads the raw config dict directly; presumably
    # equivalent to the `processing` property used by the validators —
    # confirm both resolve to the same mapping.
    return self._config['processing']['supported_file_extensions']
# Global configuration instance (created lazily by get_app_config)
_config_instance: Optional[AppConfig] = None


def get_app_config() -> AppConfig:
    """Return the global, validated application configuration singleton.

    The instance is created and validated on first use.  The global is
    only assigned *after* validation succeeds: the previous code cached
    the instance before calling validate(), so a failed validation left
    an unvalidated AppConfig cached and silently returned by all later
    calls.

    Raises:
        ValueError: if configuration validation fails.
    """
    global _config_instance
    if _config_instance is None:
        instance = AppConfig()
        instance.validate()
        _config_instance = instance
    return _config_instance


# Compatibility aliases for older call sites
init_app_config = get_app_config
get_config = get_app_config
app/core/constants.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Constants for the application

# Chunking parameters — presumably character counts for the text
# splitter; confirm against the splitter configuration.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Score thresholds, expected in [0, 1] (enforced by config validation)
SIMILARITY_THRESHOLD = 0.2
RELEVANCY_THRESHOLD = 0.25

# Token limits for AI requests
CLASSIFICATION_MAX_TOKENS = 1000
QA_MAX_TOKENS = 8000
CHECKLIST_PARSING_MAX_TOKENS = 16000  # Large enough for full checklist parsing

# AI Model Configuration
TEMPERATURE = 0.0  # Deterministic responses for due diligence consistency

# Batch sizes for bulk processing
DEFAULT_BATCH_SIZE = 10
CLASSIFICATION_BATCH_SIZE = 20

# AI Analysis types accepted by the analysis handlers
SUPPORTED_ANALYSIS_TYPES = ["overview", "strategic", "checklist", "questions"]
app/core/content_ingestion.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unified Content Ingestion System
|
| 4 |
+
|
| 5 |
+
This module provides a unified processing pipeline with simple ingestion functions.
|
| 6 |
+
All content types (VDR documents, markdown files, etc.) go through the same processing pipeline
|
| 7 |
+
with different ingestion functions handling the content-specific parsing.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# Standard library imports
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import time
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import List, Dict, Any, Optional, Tuple, Callable
|
| 16 |
+
|
| 17 |
+
# Third-party imports
|
| 18 |
+
from langchain_core.documents import Document
|
| 19 |
+
from langchain_community.vectorstores import FAISS
|
| 20 |
+
from tqdm import tqdm
|
| 21 |
+
|
| 22 |
+
# Local imports
|
| 23 |
+
from app.core.config import get_config
|
| 24 |
+
from app.core.model_cache import get_cached_embeddings
|
| 25 |
+
from app.core.parsers import parse_checklist, parse_questions
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def vdr_ingest(vdr_path: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest VDR documents using DocumentProcessor"""
    logger.info(f"Ingesting VDR documents from {vdr_path}")

    # Pre-count ingestible files so the progress bar has a total.
    ingestible = ('.pdf', '.docx', '.doc', '.txt', '.md')
    total_files = sum(
        1
        for entry in vdr_path.rglob('*')
        if entry.is_file() and entry.suffix.lower() in ingestible
    )

    # Imported here to avoid a circular import at module load time —
    # TODO confirm; mirrors the original local import.
    from app.core.utils import create_document_processor
    processor = create_document_processor(store_name=store_name)

    with tqdm(total=total_files, desc=f"Files in {store_name}",
              unit="files", leave=False) as file_pbar:
        result = processor.load_data_room(str(vdr_path))

        # load_data_room processes everything in one call; advance the
        # bar afterwards by the count it reports.
        if file_pbar and result.get('documents_count', 0) > 0:
            file_pbar.update(result['documents_count'])

    metadata = {
        'content_type': 'vdr',
        'source_path': str(vdr_path),
        'total_files': total_files,
        **result,
    }
    return processor.documents, metadata
def classify_vdr_documents(documents: List[Document], store_name: str, classifier=None) -> Dict[str, str]:
    """Classify VDR documents using fast Haiku classifier"""
    if not classifier or not documents:
        return {}

    logger.info(f"🏷️ Classifying document types for {store_name}")

    # Only the first chunk of each file is needed for type classification;
    # content is truncated to 800 characters for efficiency.
    first_chunks = [
        {
            'name': doc.metadata.get('name', ''),
            'path': doc.metadata.get('path', ''),
            'content': doc.page_content[:800],
        }
        for doc in documents
        if doc.metadata.get('is_first_chunk', False)
    ]

    if not first_chunks:
        logger.warning(f"⚠️ No first chunks found for classification in {store_name}")
        return {}

    try:
        from app.ai.document_classifier import batch_classify_document_types
        classified_docs = batch_classify_document_types(first_chunks, classifier)

        # Keep only entries that were actually classified and have a path.
        classifications = {
            doc['path']: doc['document_type']
            for doc in classified_docs
            if 'document_type' in doc and doc['path']
        }

        logger.info(f"✅ Classified {len(classifications)} document types for {store_name}")
        return classifications
    except Exception as e:
        # Classification is best-effort: log and continue without types.
        logger.error(f"⚠️ Failed to classify document types for {store_name}: {e}")
        return {}
def process_content(content_source: Any, content_type: str, store_name: str, classifier=None, llm=None) -> Dict[str, Any]:
    """Process a content source into a persisted FAISS index.

    Dispatches to the content-type-specific ingestion function, optionally
    classifies VDR documents, embeds the resulting documents, and saves the
    index plus any side files (document-type classifications, enhanced
    checklists) under the configured FAISS directory.

    Args:
        content_source: Path or object understood by the ingestion function.
        content_type: One of 'vdr', 'checklist', 'questions'.
        store_name: Name used for the saved FAISS index files.
        classifier: Optional classifier client; only used for 'vdr' content.
        llm: Optional LLM handle forwarded to the ingestion function.

    Returns:
        A result dict with 'success', 'store_name', timing information and
        ingestion metadata, or an 'error' message on failure.
    """
    start_time = time.time()

    try:
        # Ingest: parse the source into LangChain documents.
        ingest_func = get_ingestion_function(content_type)
        documents, ingestion_metadata = ingest_func(content_source, store_name, llm)

        if not documents:
            return {
                'success': False,
                'store_name': store_name,
                'error': 'No documents extracted'
            }

        # Classify VDR documents if a classifier was provided.
        classifications = {}
        if classifier and content_type == 'vdr':
            classifications = classify_vdr_documents(documents, store_name, classifier)

        # Embed and index.  get_config / get_cached_embeddings are already
        # imported at module level, so the previous function-local
        # re-imports were redundant and have been removed.
        config = get_config()
        embeddings = get_cached_embeddings(config.model['sentence_transformer_model'])
        vector_store = FAISS.from_documents(documents, embeddings)

        # Persist the index.
        faiss_dir = config.paths['faiss_dir']
        faiss_dir.mkdir(parents=True, exist_ok=True)
        vector_store.save_local(str(faiss_dir), index_name=store_name)

        # Persist per-document classifications, when available.
        if classifications:
            classifications_file = faiss_dir / f"{store_name}_document_types.json"
            classifications_file.write_text(
                json.dumps(classifications, indent=2, ensure_ascii=False)
            )

        # Persist enhanced checklists produced during ingestion, if any.
        if 'enhanced_checklists' in ingestion_metadata:
            checklists_file = faiss_dir / "checklists.json"
            checklists_file.write_text(
                json.dumps(ingestion_metadata['enhanced_checklists'], indent=2, ensure_ascii=False)
            )

        processing_time = time.time() - start_time

        return {
            'success': True,
            'store_name': store_name,
            'processing_time': processing_time,
            'classifications_count': len(classifications),
            **ingestion_metadata
        }

    except Exception as e:
        # Broad catch is deliberate: callers receive a structured failure
        # result instead of an exception.
        return {
            'success': False,
            'store_name': store_name,
            'error': str(e),
            'processing_time': time.time() - start_time
        }
def checklist_ingest(content_dir: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest checklist markdown files"""
    logger.info(f"Ingesting checklist files from {content_dir}")

    if not content_dir.exists():
        raise FileNotFoundError(f"Checklist directory not found: {content_dir}")

    md_files = list(content_dir.glob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    documents: List[Document] = []

    with tqdm(md_files, desc="Processing checklist files",
              unit="file", leave=False) as progress:
        for md_path in progress:
            progress.set_description(f"Processing {md_path.name}")
            logger.info(f"Processing: {md_path.name}")

            parsed = parse_checklist(md_path.read_text(encoding='utf-8'), llm)

            # Flatten every checklist item into its own document.
            for category in parsed.values():
                for item in category.get('items', []):
                    documents.append(Document(
                        page_content=item['text'],
                        metadata={
                            'source': md_path.name,
                            'category': category['name'],
                            'type': 'checklist_item',
                        },
                    ))

    return documents, {
        'content_type': 'checklist',
        'source_path': str(content_dir),
        'md_files_count': len(md_files),
        'documents_count': len(documents),
    }
def questions_ingest(content_dir: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest questions markdown files"""
    logger.info(f"Ingesting questions files from {content_dir}")

    if not content_dir.exists():
        raise FileNotFoundError(f"Questions directory not found: {content_dir}")

    md_files = list(content_dir.glob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    documents: List[Document] = []

    with tqdm(md_files, desc="Processing questions files",
              unit="file", leave=False) as progress:
        for md_path in progress:
            progress.set_description(f"Processing {md_path.name}")
            logger.info(f"Processing: {md_path.name}")

            # One document per parsed question, with its category folded
            # into the page content for retrieval.
            for question in parse_questions(md_path.read_text(encoding='utf-8'), llm):
                documents.append(Document(
                    page_content=f"{question['category']}: {question['question']}",
                    metadata={
                        'source': md_path.name,
                        'category': question['category'],
                        'question_id': question['id'],
                        'type': 'question',
                    },
                ))

    return documents, {
        'content_type': 'questions',
        'source_path': str(content_dir),
        'md_files_count': len(md_files),
        'documents_count': len(documents),
    }
# Factory function for getting ingestion functions
def get_ingestion_function(content_type: str) -> Callable[..., Tuple[List[Document], Dict[str, Any]]]:
    """Factory function to get appropriate ingestion function"""
    functions = {
        'vdr': vdr_ingest,
        'checklist': checklist_ingest,
        'questions': questions_ingest,
    }
    try:
        return functions[content_type]
    except KeyError:
        raise ValueError(
            f"Unknown content type: {content_type}. Available: {list(functions.keys())}"
        ) from None
|
| 276 |
+
# Backward compatibility - create UnifiedContentProcessor class that uses process_content
class UnifiedContentProcessor:
    """Backward compatibility wrapper for process_content function"""

    def process_content_source(self, content_source: Any, content_type: str, store_name: str, classifier=None, progress_bar=None, llm=None):
        """Process content using the unified function"""
        # NOTE: 'progress_bar' is accepted for legacy callers but is not
        # forwarded — process_content does not take it.
        return process_content(content_source, content_type, store_name, classifier, llm)
src/document_processing.py → app/core/document_processor.py
RENAMED
|
@@ -2,19 +2,18 @@
|
|
| 2 |
"""
|
| 3 |
Streamlined Document Processing Module
|
| 4 |
|
| 5 |
-
This module provides a
|
| 6 |
-
- Direct LangChain loader integration with glob patterns
|
| 7 |
- Built-in FAISS vector storage without external file tracking
|
| 8 |
- Semantic text chunking using RecursiveCharacterTextSplitter
|
| 9 |
- Consolidated document metadata handling
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
| 13 |
-
import
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
os.environ.setdefault("TOKENIZERS_PARALLELISM", "
|
| 17 |
-
import re
|
| 18 |
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Dict, List, Optional, Any, Callable
|
|
@@ -23,17 +22,23 @@ from datetime import datetime
|
|
| 23 |
# LangChain imports
|
| 24 |
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
|
| 25 |
from langchain_community.vectorstores import FAISS
|
| 26 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 27 |
from langchain_core.documents import Document
|
| 28 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
| 29 |
|
| 30 |
-
# Import configuration
|
| 31 |
-
from .config import
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
# =============================================================================
|
|
@@ -43,13 +48,13 @@ logger = logging.getLogger(__name__)
|
|
| 43 |
def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
|
| 44 |
"""
|
| 45 |
Execute a function with basic error handling and logging
|
| 46 |
-
|
| 47 |
Args:
|
| 48 |
func: Function to execute
|
| 49 |
default: Value to return on error
|
| 50 |
context: Brief description for logs
|
| 51 |
log_errors: Whether to log errors
|
| 52 |
-
|
| 53 |
Returns:
|
| 54 |
Function result or default value on error
|
| 55 |
"""
|
|
@@ -78,71 +83,98 @@ def escape_markdown_math(text: str) -> str:
|
|
| 78 |
class DocumentProcessor:
|
| 79 |
"""
|
| 80 |
Streamlined document processing class with integrated FAISS vector storage
|
| 81 |
-
|
| 82 |
This class consolidates all document processing functionality including:
|
| 83 |
- Document loading using LangChain's DirectoryLoader with glob patterns
|
| 84 |
- Semantic text chunking with RecursiveCharacterTextSplitter
|
| 85 |
- FAISS vector storage for similarity search
|
| 86 |
- Document metadata handling
|
| 87 |
"""
|
| 88 |
-
|
| 89 |
def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
|
| 90 |
"""
|
| 91 |
Initialize the document processor
|
| 92 |
-
|
| 93 |
Args:
|
| 94 |
model_name: Name of the sentence transformer model for embeddings (optional)
|
| 95 |
store_name: Name for the FAISS store (optional, uses config default)
|
| 96 |
"""
|
| 97 |
-
config =
|
| 98 |
-
self.model_name = model_name or config.model
|
| 99 |
-
self.store_name = store_name or config.processing
|
| 100 |
-
|
| 101 |
# Initialize components
|
| 102 |
self.documents: List[Document] = []
|
| 103 |
self.vector_store: Optional[FAISS] = None
|
| 104 |
self.embeddings: Optional[HuggingFaceEmbeddings] = None
|
| 105 |
self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
|
| 106 |
self.performance_stats = {}
|
| 107 |
-
|
| 108 |
# Convenience properties for backward compatibility
|
| 109 |
self.chunks = [] # Will be populated after processing
|
| 110 |
-
|
| 111 |
# Initialize text splitter with semantic boundaries
|
| 112 |
self._init_text_splitter()
|
| 113 |
-
|
| 114 |
# Initialize embeddings if model name provided
|
| 115 |
if self.model_name:
|
| 116 |
-
self.embeddings =
|
| 117 |
-
logger.info(f"Initialized embeddings with model: {self.model_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
else:
|
| 119 |
logger.warning("No model name provided - embeddings not initialized")
|
| 120 |
-
|
|
|
|
| 121 |
# Try to load existing FAISS store
|
| 122 |
self._load_existing_store()
|
| 123 |
-
|
| 124 |
def _init_text_splitter(self):
|
| 125 |
"""Initialize the text splitter with optimal settings for semantic chunking"""
|
| 126 |
-
config =
|
| 127 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 128 |
-
chunk_size=config.processing
|
| 129 |
-
chunk_overlap=config.processing
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
length_function=len,
|
| 132 |
is_separator_regex=False,
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
-
logger.info(f"Initialized text splitter: {config.processing
|
| 135 |
-
|
| 136 |
def _load_existing_store(self):
|
| 137 |
"""Load existing FAISS store if available"""
|
| 138 |
if not self.embeddings:
|
| 139 |
return
|
| 140 |
-
|
| 141 |
-
config =
|
| 142 |
-
faiss_dir =
|
| 143 |
faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
|
| 144 |
faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
|
| 145 |
-
|
| 146 |
try:
|
| 147 |
if faiss_index_path.exists() and faiss_pkl_path.exists():
|
| 148 |
self.vector_store = FAISS.load_local(
|
|
@@ -157,60 +189,54 @@ class DocumentProcessor:
|
|
| 157 |
except Exception as e:
|
| 158 |
logger.error(f"Failed to load FAISS store: {e}")
|
| 159 |
self.vector_store = None
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
"""Save FAISS store to disk"""
|
| 163 |
-
if not self.vector_store:
|
| 164 |
-
return
|
| 165 |
-
|
| 166 |
-
try:
|
| 167 |
-
config = get_config()
|
| 168 |
-
faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
|
| 169 |
-
faiss_dir.mkdir(parents=True, exist_ok=True)
|
| 170 |
-
|
| 171 |
-
self.vector_store.save_local(
|
| 172 |
-
str(faiss_dir),
|
| 173 |
-
index_name=self.store_name
|
| 174 |
-
)
|
| 175 |
-
logger.info(f"Saved FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
|
| 176 |
-
except Exception as e:
|
| 177 |
-
logger.error(f"Failed to save FAISS store: {e}")
|
| 178 |
-
|
| 179 |
def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
|
| 180 |
"""
|
| 181 |
Load and process an entire data room using DirectoryLoader with glob patterns
|
| 182 |
-
|
| 183 |
Args:
|
| 184 |
data_room_path: Path to the data room directory
|
| 185 |
progress_bar: Optional Streamlit progress bar object
|
| 186 |
-
|
| 187 |
Returns:
|
| 188 |
Dictionary with processing results including performance metrics
|
| 189 |
"""
|
| 190 |
import time
|
| 191 |
start_time = time.time()
|
| 192 |
-
|
| 193 |
-
config =
|
| 194 |
data_room_path = Path(data_room_path)
|
| 195 |
-
|
| 196 |
if not data_room_path.exists():
|
| 197 |
logger.error(f"Data room path does not exist: {data_room_path}")
|
| 198 |
return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
|
| 199 |
-
|
| 200 |
logger.info(f"Starting streamlined data room processing: {data_room_path}")
|
| 201 |
-
|
| 202 |
# Clear existing documents
|
| 203 |
self.documents = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
documents_loaded = 0
|
| 205 |
-
|
|
|
|
| 206 |
# Load documents by file type using DirectoryLoader with glob patterns
|
| 207 |
-
supported_extensions = config.processing
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
for ext in supported_extensions:
|
| 210 |
try:
|
| 211 |
# Create glob pattern for this extension
|
| 212 |
glob_pattern = f"**/*{ext}"
|
| 213 |
-
|
| 214 |
# Choose appropriate loader based on extension
|
| 215 |
if ext == '.pdf':
|
| 216 |
loader_cls = PyPDFLoader
|
|
@@ -220,7 +246,7 @@ class DocumentProcessor:
|
|
| 220 |
loader_cls = TextLoader
|
| 221 |
else:
|
| 222 |
continue
|
| 223 |
-
|
| 224 |
# Use DirectoryLoader with glob pattern
|
| 225 |
loader = DirectoryLoader(
|
| 226 |
str(data_room_path),
|
|
@@ -231,14 +257,14 @@ class DocumentProcessor:
|
|
| 231 |
show_progress=False, # Disable verbose progress output
|
| 232 |
use_multithreading=True
|
| 233 |
)
|
| 234 |
-
|
| 235 |
# Load documents for this extension
|
| 236 |
docs = safe_execute(
|
| 237 |
lambda: loader.load(),
|
| 238 |
default=[],
|
| 239 |
context=f"Loading {ext} files"
|
| 240 |
)
|
| 241 |
-
|
| 242 |
if docs:
|
| 243 |
# Add relative path information to metadata
|
| 244 |
for doc in docs:
|
|
@@ -253,34 +279,55 @@ class DocumentProcessor:
|
|
| 253 |
# If relative path fails, use original source
|
| 254 |
doc.metadata['path'] = doc.metadata['source']
|
| 255 |
doc.metadata['name'] = source_path.name
|
| 256 |
-
|
| 257 |
self.documents.extend(docs)
|
| 258 |
documents_loaded += len(docs)
|
| 259 |
logger.info(f"Loaded {len(docs)} {ext} documents")
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
except Exception as e:
|
| 262 |
logger.error(f"Error loading {ext} files: {e}")
|
| 263 |
-
|
| 264 |
scan_time = time.time() - start_time
|
| 265 |
logger.info(f"Document loading completed in {scan_time:.2f} seconds")
|
| 266 |
-
|
| 267 |
# Split documents into chunks using the text splitter
|
| 268 |
chunk_start = time.time()
|
| 269 |
if self.documents and self.text_splitter:
|
|
|
|
|
|
|
|
|
|
| 270 |
self.documents = self.text_splitter.split_documents(self.documents)
|
| 271 |
-
|
| 272 |
# Add chunk metadata and populate chunks for backward compatibility
|
|
|
|
|
|
|
| 273 |
self.chunks = []
|
|
|
|
| 274 |
for i, doc in enumerate(self.documents):
|
| 275 |
doc.metadata['chunk_id'] = f"chunk_{i}"
|
| 276 |
doc.metadata['processed_at'] = datetime.now().isoformat()
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
# Add citation information if available
|
| 279 |
if 'page' in doc.metadata:
|
| 280 |
doc.metadata['citation'] = f"page {doc.metadata['page']}"
|
| 281 |
else:
|
| 282 |
doc.metadata['citation'] = doc.metadata.get('name', 'document')
|
| 283 |
-
|
| 284 |
# Create chunk dict for backward compatibility
|
| 285 |
chunk_dict = {
|
| 286 |
'text': doc.page_content,
|
|
@@ -290,33 +337,29 @@ class DocumentProcessor:
|
|
| 290 |
'metadata': doc.metadata
|
| 291 |
}
|
| 292 |
self.chunks.append(chunk_dict)
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
| 294 |
chunk_time = time.time() - chunk_start
|
| 295 |
logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
|
| 296 |
-
|
| 297 |
-
|
| 298 |
embedding_time = 0
|
| 299 |
if self.embeddings and self.documents:
|
| 300 |
embedding_start = time.time()
|
| 301 |
-
|
| 302 |
if self.vector_store is None:
|
| 303 |
-
|
| 304 |
-
self.vector_store = FAISS.from_documents(self.documents, self.embeddings)
|
| 305 |
-
logger.info(f"Created new FAISS store with {len(self.documents)} documents")
|
| 306 |
else:
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
logger.info(f"Added {len(self.documents)} documents to existing FAISS store")
|
| 310 |
-
|
| 311 |
-
# Save the updated store
|
| 312 |
-
self._save_store()
|
| 313 |
-
|
| 314 |
embedding_time = time.time() - embedding_start
|
| 315 |
-
logger.info(f"FAISS
|
| 316 |
-
|
| 317 |
total_time = time.time() - start_time
|
| 318 |
logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
|
| 319 |
-
|
| 320 |
# Store performance stats
|
| 321 |
self.performance_stats = {
|
| 322 |
'total_time': total_time,
|
|
@@ -325,7 +368,7 @@ class DocumentProcessor:
|
|
| 325 |
'embedding_time': embedding_time,
|
| 326 |
'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
|
| 327 |
}
|
| 328 |
-
|
| 329 |
return {
|
| 330 |
'documents_count': documents_loaded,
|
| 331 |
'chunks_count': len(self.documents),
|
|
@@ -333,65 +376,80 @@ class DocumentProcessor:
|
|
| 333 |
'has_embeddings': self.vector_store is not None,
|
| 334 |
'performance': self.performance_stats
|
| 335 |
}
|
| 336 |
-
|
| 337 |
def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
| 338 |
"""
|
| 339 |
Search documents using FAISS similarity search
|
| 340 |
-
|
| 341 |
Args:
|
| 342 |
query: Search query
|
| 343 |
top_k: Number of top results to return
|
| 344 |
threshold: Minimum similarity threshold
|
| 345 |
-
|
| 346 |
Returns:
|
| 347 |
List of search results with scores and metadata
|
| 348 |
"""
|
| 349 |
if not self.vector_store:
|
| 350 |
logger.warning("FAISS vector store not available for search")
|
| 351 |
return []
|
| 352 |
-
|
| 353 |
-
config =
|
| 354 |
if threshold is None:
|
| 355 |
-
threshold = config.processing
|
| 356 |
-
|
| 357 |
try:
|
| 358 |
-
# Perform similarity search with scores
|
| 359 |
-
docs_and_scores = self.vector_store.similarity_search_with_score(query, k=top_k*
|
| 360 |
-
|
| 361 |
-
|
|
|
|
| 362 |
seen_texts = set()
|
| 363 |
-
|
| 364 |
for doc, score in docs_and_scores:
|
| 365 |
# Convert FAISS distance to similarity score (higher is better)
|
| 366 |
-
similarity_score = 1.0
|
| 367 |
-
|
| 368 |
if similarity_score < threshold:
|
| 369 |
continue
|
| 370 |
-
|
| 371 |
# Avoid duplicates based on text content
|
| 372 |
text_preview = doc.page_content[:100]
|
| 373 |
if text_preview not in seen_texts:
|
| 374 |
seen_texts.add(text_preview)
|
| 375 |
-
|
| 376 |
-
|
| 377 |
'text': doc.page_content,
|
| 378 |
'source': doc.metadata.get('name', ''),
|
| 379 |
'path': doc.metadata.get('path', ''),
|
| 380 |
-
'full_path': doc.metadata.get('source', ''),
|
| 381 |
-
'citation': doc.metadata.get('citation', 'document'),
|
| 382 |
'score': float(similarity_score),
|
| 383 |
'metadata': doc.metadata
|
| 384 |
})
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
return results
|
| 390 |
-
|
| 391 |
except Exception as e:
|
| 392 |
logger.error(f"Failed to search FAISS store: {e}")
|
| 393 |
-
|
| 394 |
-
|
| 395 |
def get_statistics(self) -> Dict[str, Any]:
|
| 396 |
"""Get processing statistics"""
|
| 397 |
stats = {
|
|
@@ -401,10 +459,9 @@ class DocumentProcessor:
|
|
| 401 |
'store_name': self.store_name,
|
| 402 |
'model_name': self.model_name
|
| 403 |
}
|
| 404 |
-
|
| 405 |
# Add performance metrics if available
|
| 406 |
if self.performance_stats:
|
| 407 |
stats['performance'] = self.performance_stats
|
| 408 |
-
|
| 409 |
return stats
|
| 410 |
-
|
|
|
|
| 2 |
"""
|
| 3 |
Streamlined Document Processing Module
|
| 4 |
|
| 5 |
+
This module provides a document processing pipeline with:
|
| 6 |
+
- Direct LangChain loader integration with glob patterns
|
| 7 |
- Built-in FAISS vector storage without external file tracking
|
| 8 |
- Semantic text chunking using RecursiveCharacterTextSplitter
|
| 9 |
- Consolidated document metadata handling
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
| 13 |
+
import time
|
| 14 |
|
| 15 |
+
# Enable tokenizers parallelism for better performance
|
| 16 |
+
os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
|
|
|
|
| 17 |
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Dict, List, Optional, Any, Callable
|
|
|
|
| 22 |
# LangChain imports
|
| 23 |
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
|
| 24 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 25 |
from langchain_core.documents import Document
|
| 26 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 27 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 28 |
|
| 29 |
+
# Import configuration and utilities from app modules
|
| 30 |
+
from app.core.config import get_app_config
|
| 31 |
+
from app.core.model_cache import get_cached_embeddings
|
| 32 |
+
from app.core.logging import logger
|
| 33 |
+
from app.core.performance import get_performance_manager, monitor_performance, cached_by_content
|
| 34 |
|
| 35 |
+
# Optional accelerate import
|
| 36 |
+
try:
|
| 37 |
+
from accelerate import Accelerator
|
| 38 |
+
ACCELERATE_AVAILABLE = True
|
| 39 |
+
except ImportError:
|
| 40 |
+
ACCELERATE_AVAILABLE = False
|
| 41 |
+
Accelerator = None
|
| 42 |
|
| 43 |
|
| 44 |
# =============================================================================
|
|
|
|
| 48 |
def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
|
| 49 |
"""
|
| 50 |
Execute a function with basic error handling and logging
|
| 51 |
+
|
| 52 |
Args:
|
| 53 |
func: Function to execute
|
| 54 |
default: Value to return on error
|
| 55 |
context: Brief description for logs
|
| 56 |
log_errors: Whether to log errors
|
| 57 |
+
|
| 58 |
Returns:
|
| 59 |
Function result or default value on error
|
| 60 |
"""
|
|
|
|
| 83 |
class DocumentProcessor:
|
| 84 |
"""
|
| 85 |
Streamlined document processing class with integrated FAISS vector storage
|
| 86 |
+
|
| 87 |
This class consolidates all document processing functionality including:
|
| 88 |
- Document loading using LangChain's DirectoryLoader with glob patterns
|
| 89 |
- Semantic text chunking with RecursiveCharacterTextSplitter
|
| 90 |
- FAISS vector storage for similarity search
|
| 91 |
- Document metadata handling
|
| 92 |
"""
|
| 93 |
+
|
| 94 |
def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
|
| 95 |
"""
|
| 96 |
Initialize the document processor
|
| 97 |
+
|
| 98 |
Args:
|
| 99 |
model_name: Name of the sentence transformer model for embeddings (optional)
|
| 100 |
store_name: Name for the FAISS store (optional, uses config default)
|
| 101 |
"""
|
| 102 |
+
config = get_app_config()
|
| 103 |
+
self.model_name = model_name or config.model['sentence_transformer_model']
|
| 104 |
+
self.store_name = store_name or config.processing['faiss_store_name']
|
| 105 |
+
|
| 106 |
# Initialize components
|
| 107 |
self.documents: List[Document] = []
|
| 108 |
self.vector_store: Optional[FAISS] = None
|
| 109 |
self.embeddings: Optional[HuggingFaceEmbeddings] = None
|
| 110 |
self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
|
| 111 |
self.performance_stats = {}
|
| 112 |
+
|
| 113 |
# Convenience properties for backward compatibility
|
| 114 |
self.chunks = [] # Will be populated after processing
|
| 115 |
+
|
| 116 |
# Initialize text splitter with semantic boundaries
|
| 117 |
self._init_text_splitter()
|
| 118 |
+
|
| 119 |
# Initialize embeddings if model name provided
|
| 120 |
if self.model_name:
|
| 121 |
+
self.embeddings = get_cached_embeddings(self.model_name)
|
| 122 |
+
logger.info(f"Initialized cached embeddings with model: {self.model_name}")
|
| 123 |
+
|
| 124 |
+
# Setup accelerate for GPU optimization if available
|
| 125 |
+
if ACCELERATE_AVAILABLE:
|
| 126 |
+
try:
|
| 127 |
+
self.accelerator = Accelerator()
|
| 128 |
+
logger.info(f"Accelerate initialized with device: {self.accelerator.device}")
|
| 129 |
+
except Exception as e:
|
| 130 |
+
logger.warning(f"Failed to initialize accelerate: {e}")
|
| 131 |
+
self.accelerator = None
|
| 132 |
+
else:
|
| 133 |
+
self.accelerator = None
|
| 134 |
else:
|
| 135 |
logger.warning("No model name provided - embeddings not initialized")
|
| 136 |
+
self.accelerator = None
|
| 137 |
+
|
| 138 |
# Try to load existing FAISS store
|
| 139 |
self._load_existing_store()
|
| 140 |
+
|
| 141 |
def _init_text_splitter(self):
|
| 142 |
"""Initialize the text splitter with optimal settings for semantic chunking"""
|
| 143 |
+
config = get_app_config()
|
| 144 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 145 |
+
chunk_size=config.processing['chunk_size'],
|
| 146 |
+
chunk_overlap=config.processing['chunk_overlap'],
|
| 147 |
+
# Better separators for business documents with semantic boundaries
|
| 148 |
+
separators=[
|
| 149 |
+
"\n\n\n", # Triple newlines (major section breaks)
|
| 150 |
+
"\n\n", # Double newlines (paragraph breaks)
|
| 151 |
+
"\n", # Single newlines
|
| 152 |
+
". ", # Sentences
|
| 153 |
+
".\n", # Sentences with newlines
|
| 154 |
+
"! ", # Exclamations
|
| 155 |
+
"? ", # Questions
|
| 156 |
+
"; ", # Semicolons (common in legal/business docs)
|
| 157 |
+
", ", # Commas (last resort for long sentences)
|
| 158 |
+
" ", # Spaces
|
| 159 |
+
"", # Character level (absolute last resort)
|
| 160 |
+
],
|
| 161 |
length_function=len,
|
| 162 |
is_separator_regex=False,
|
| 163 |
+
# Keep related content together
|
| 164 |
+
keep_separator=True, # Keep separators to maintain context
|
| 165 |
)
|
| 166 |
+
logger.info(f"Initialized semantic text splitter: {config.processing['chunk_size']} chars, {config.processing['chunk_overlap']} overlap")
|
| 167 |
+
|
| 168 |
def _load_existing_store(self):
|
| 169 |
"""Load existing FAISS store if available"""
|
| 170 |
if not self.embeddings:
|
| 171 |
return
|
| 172 |
+
|
| 173 |
+
config = get_app_config()
|
| 174 |
+
faiss_dir = config.paths['faiss_dir']
|
| 175 |
faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
|
| 176 |
faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
|
| 177 |
+
|
| 178 |
try:
|
| 179 |
if faiss_index_path.exists() and faiss_pkl_path.exists():
|
| 180 |
self.vector_store = FAISS.load_local(
|
|
|
|
| 189 |
except Exception as e:
|
| 190 |
logger.error(f"Failed to load FAISS store: {e}")
|
| 191 |
self.vector_store = None
|
| 192 |
+
|
| 193 |
+
@monitor_performance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
|
| 195 |
"""
|
| 196 |
Load and process an entire data room using DirectoryLoader with glob patterns
|
| 197 |
+
|
| 198 |
Args:
|
| 199 |
data_room_path: Path to the data room directory
|
| 200 |
progress_bar: Optional Streamlit progress bar object
|
| 201 |
+
|
| 202 |
Returns:
|
| 203 |
Dictionary with processing results including performance metrics
|
| 204 |
"""
|
| 205 |
import time
|
| 206 |
start_time = time.time()
|
| 207 |
+
|
| 208 |
+
config = get_app_config()
|
| 209 |
data_room_path = Path(data_room_path)
|
| 210 |
+
|
| 211 |
if not data_room_path.exists():
|
| 212 |
logger.error(f"Data room path does not exist: {data_room_path}")
|
| 213 |
return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
|
| 214 |
+
|
| 215 |
logger.info(f"Starting streamlined data room processing: {data_room_path}")
|
| 216 |
+
|
| 217 |
# Clear existing documents
|
| 218 |
self.documents = []
|
| 219 |
+
|
| 220 |
+
@monitor_performance
|
| 221 |
+
def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
|
| 222 |
+
start_time = time.time()
|
| 223 |
documents_loaded = 0
|
| 224 |
+
config = get_app_config()
|
| 225 |
+
|
| 226 |
# Load documents by file type using DirectoryLoader with glob patterns
|
| 227 |
+
supported_extensions = config.processing['supported_file_extensions']
|
| 228 |
+
perf_manager = get_performance_manager()
|
| 229 |
+
|
| 230 |
+
# Get memory info for batch optimization
|
| 231 |
+
mem_info = perf_manager.monitor_memory_usage()
|
| 232 |
+
logger.info(f"Memory usage at start: {mem_info['percent']:.1f}%")
|
| 233 |
+
logger.info(f"Available memory: {mem_info['rss']:.1f}MB")
|
| 234 |
+
|
| 235 |
for ext in supported_extensions:
|
| 236 |
try:
|
| 237 |
# Create glob pattern for this extension
|
| 238 |
glob_pattern = f"**/*{ext}"
|
| 239 |
+
|
| 240 |
# Choose appropriate loader based on extension
|
| 241 |
if ext == '.pdf':
|
| 242 |
loader_cls = PyPDFLoader
|
|
|
|
| 246 |
loader_cls = TextLoader
|
| 247 |
else:
|
| 248 |
continue
|
| 249 |
+
|
| 250 |
# Use DirectoryLoader with glob pattern
|
| 251 |
loader = DirectoryLoader(
|
| 252 |
str(data_room_path),
|
|
|
|
| 257 |
show_progress=False, # Disable verbose progress output
|
| 258 |
use_multithreading=True
|
| 259 |
)
|
| 260 |
+
|
| 261 |
# Load documents for this extension
|
| 262 |
docs = safe_execute(
|
| 263 |
lambda: loader.load(),
|
| 264 |
default=[],
|
| 265 |
context=f"Loading {ext} files"
|
| 266 |
)
|
| 267 |
+
|
| 268 |
if docs:
|
| 269 |
# Add relative path information to metadata
|
| 270 |
for doc in docs:
|
|
|
|
| 279 |
# If relative path fails, use original source
|
| 280 |
doc.metadata['path'] = doc.metadata['source']
|
| 281 |
doc.metadata['name'] = source_path.name
|
| 282 |
+
|
| 283 |
self.documents.extend(docs)
|
| 284 |
documents_loaded += len(docs)
|
| 285 |
logger.info(f"Loaded {len(docs)} {ext} documents")
|
| 286 |
+
|
| 287 |
+
# Monitor memory usage and trigger GC if needed
|
| 288 |
+
mem_usage = perf_manager.monitor_memory_usage()
|
| 289 |
+
if perf_manager.should_gc_collect(mem_usage):
|
| 290 |
+
import gc
|
| 291 |
+
gc.collect()
|
| 292 |
+
logger.debug(f"GC triggered - memory usage: {mem_usage['rss']:.1f}MB")
|
| 293 |
except Exception as e:
|
| 294 |
logger.error(f"Error loading {ext} files: {e}")
|
| 295 |
+
|
| 296 |
scan_time = time.time() - start_time
|
| 297 |
logger.info(f"Document loading completed in {scan_time:.2f} seconds")
|
| 298 |
+
|
| 299 |
# Split documents into chunks using the text splitter
|
| 300 |
chunk_start = time.time()
|
| 301 |
if self.documents and self.text_splitter:
|
| 302 |
+
# Track original documents to identify first chunks
|
| 303 |
+
original_docs = {doc.metadata.get('source', ''): True for doc in self.documents}
|
| 304 |
+
|
| 305 |
self.documents = self.text_splitter.split_documents(self.documents)
|
| 306 |
+
|
| 307 |
# Add chunk metadata and populate chunks for backward compatibility
|
| 308 |
+
# Track which documents we've seen to mark first chunks
|
| 309 |
+
seen_documents = {}
|
| 310 |
self.chunks = []
|
| 311 |
+
|
| 312 |
for i, doc in enumerate(self.documents):
|
| 313 |
doc.metadata['chunk_id'] = f"chunk_{i}"
|
| 314 |
doc.metadata['processed_at'] = datetime.now().isoformat()
|
| 315 |
+
|
| 316 |
+
# Mark first chunks for each document (critical for document type matching)
|
| 317 |
+
doc_source = doc.metadata.get('source', '')
|
| 318 |
+
if doc_source not in seen_documents:
|
| 319 |
+
doc.metadata['is_first_chunk'] = True
|
| 320 |
+
seen_documents[doc_source] = True
|
| 321 |
+
logger.debug(f"First chunk marked for: {doc_source}")
|
| 322 |
+
else:
|
| 323 |
+
doc.metadata['is_first_chunk'] = False
|
| 324 |
+
|
| 325 |
# Add citation information if available
|
| 326 |
if 'page' in doc.metadata:
|
| 327 |
doc.metadata['citation'] = f"page {doc.metadata['page']}"
|
| 328 |
else:
|
| 329 |
doc.metadata['citation'] = doc.metadata.get('name', 'document')
|
| 330 |
+
|
| 331 |
# Create chunk dict for backward compatibility
|
| 332 |
chunk_dict = {
|
| 333 |
'text': doc.page_content,
|
|
|
|
| 337 |
'metadata': doc.metadata
|
| 338 |
}
|
| 339 |
self.chunks.append(chunk_dict)
|
| 340 |
+
|
| 341 |
+
first_chunks_count = len([doc for doc in self.documents if doc.metadata.get('is_first_chunk', False)])
|
| 342 |
+
logger.info(f"Marked {first_chunks_count} first chunks out of {len(self.documents)} total chunks")
|
| 343 |
+
|
| 344 |
chunk_time = time.time() - chunk_start
|
| 345 |
logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
|
| 346 |
+
|
| 347 |
+
# FAISS vector store should be loaded from pre-built indices
|
| 348 |
embedding_time = 0
|
| 349 |
if self.embeddings and self.documents:
|
| 350 |
embedding_start = time.time()
|
| 351 |
+
|
| 352 |
if self.vector_store is None:
|
| 353 |
+
logger.debug("FAISS store not pre-loaded (expected during index building)")
|
|
|
|
|
|
|
| 354 |
else:
|
| 355 |
+
logger.info(f"Using pre-loaded FAISS store with {self.vector_store.index.ntotal} vectors")
|
| 356 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
embedding_time = time.time() - embedding_start
|
| 358 |
+
logger.info(f"FAISS check completed in {embedding_time:.2f} seconds")
|
| 359 |
+
|
| 360 |
total_time = time.time() - start_time
|
| 361 |
logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
|
| 362 |
+
|
| 363 |
# Store performance stats
|
| 364 |
self.performance_stats = {
|
| 365 |
'total_time': total_time,
|
|
|
|
| 368 |
'embedding_time': embedding_time,
|
| 369 |
'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
|
| 370 |
}
|
| 371 |
+
|
| 372 |
return {
|
| 373 |
'documents_count': documents_loaded,
|
| 374 |
'chunks_count': len(self.documents),
|
|
|
|
| 376 |
'has_embeddings': self.vector_store is not None,
|
| 377 |
'performance': self.performance_stats
|
| 378 |
}
|
| 379 |
+
|
| 380 |
def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
| 381 |
"""
|
| 382 |
Search documents using FAISS similarity search
|
| 383 |
+
|
| 384 |
Args:
|
| 385 |
query: Search query
|
| 386 |
top_k: Number of top results to return
|
| 387 |
threshold: Minimum similarity threshold
|
| 388 |
+
|
| 389 |
Returns:
|
| 390 |
List of search results with scores and metadata
|
| 391 |
"""
|
| 392 |
if not self.vector_store:
|
| 393 |
logger.warning("FAISS vector store not available for search")
|
| 394 |
return []
|
| 395 |
+
|
| 396 |
+
config = get_app_config()
|
| 397 |
if threshold is None:
|
| 398 |
+
threshold = config.processing['similarity_threshold']
|
| 399 |
+
|
| 400 |
try:
|
| 401 |
+
# Perform similarity search with scores - get more candidates for reranking
|
| 402 |
+
docs_and_scores = self.vector_store.similarity_search_with_score(query, k=max(20, top_k*3))
|
| 403 |
+
|
| 404 |
+
# Initial filtering and conversion to candidates format
|
| 405 |
+
candidates = []
|
| 406 |
seen_texts = set()
|
| 407 |
+
|
| 408 |
for doc, score in docs_and_scores:
|
| 409 |
# Convert FAISS distance to similarity score (higher is better)
|
| 410 |
+
similarity_score = 1.0 - (score / 2.0) if score <= 2.0 else 0.0
|
| 411 |
+
|
| 412 |
if similarity_score < threshold:
|
| 413 |
continue
|
| 414 |
+
|
| 415 |
# Avoid duplicates based on text content
|
| 416 |
text_preview = doc.page_content[:100]
|
| 417 |
if text_preview not in seen_texts:
|
| 418 |
seen_texts.add(text_preview)
|
| 419 |
+
|
| 420 |
+
candidates.append({
|
| 421 |
'text': doc.page_content,
|
| 422 |
'source': doc.metadata.get('name', ''),
|
| 423 |
'path': doc.metadata.get('path', ''),
|
|
|
|
|
|
|
| 424 |
'score': float(similarity_score),
|
| 425 |
'metadata': doc.metadata
|
| 426 |
})
|
| 427 |
+
|
| 428 |
+
# Apply reranking if we have candidates
|
| 429 |
+
if candidates:
|
| 430 |
+
try:
|
| 431 |
+
# Import rerank_results from ranking module to avoid circular import
|
| 432 |
+
from app.core.ranking import rerank_results
|
| 433 |
+
|
| 434 |
+
# Rerank the top candidates (limit to reasonable number for performance)
|
| 435 |
+
candidates_to_rerank = candidates[:min(15, len(candidates))] # Rerank up to 15 candidates
|
| 436 |
+
|
| 437 |
+
reranked_results = rerank_results(query, candidates_to_rerank)
|
| 438 |
+
results = reranked_results[:top_k] # Take top_k after reranking
|
| 439 |
+
logger.info(f"Reranked {len(reranked_results)} search results for query: {query[:50]}...")
|
| 440 |
+
except Exception as e:
|
| 441 |
+
# Reranking failed - use original results without reranking
|
| 442 |
+
logger.warning(f"Reranking failed for search query '{query}': {e}. Using original similarity scores.")
|
| 443 |
+
results = candidates[:top_k]
|
| 444 |
+
else:
|
| 445 |
+
results = []
|
| 446 |
+
|
| 447 |
return results
|
| 448 |
+
|
| 449 |
except Exception as e:
|
| 450 |
logger.error(f"Failed to search FAISS store: {e}")
|
| 451 |
+
raise RuntimeError(f"Document search failed for query '{query}': {e}") from e
|
| 452 |
+
|
| 453 |
def get_statistics(self) -> Dict[str, Any]:
|
| 454 |
"""Get processing statistics"""
|
| 455 |
stats = {
|
|
|
|
| 459 |
'store_name': self.store_name,
|
| 460 |
'model_name': self.model_name
|
| 461 |
}
|
| 462 |
+
|
| 463 |
# Add performance metrics if available
|
| 464 |
if self.performance_stats:
|
| 465 |
stats['performance'] = self.performance_stats
|
| 466 |
+
|
| 467 |
return stats
|
|
|
app/core/exceptions.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Core Exception Classes
|
| 4 |
+
|
| 5 |
+
Centralized exception definitions for the application.
|
| 6 |
+
This module provides clean exception classes without
|
| 7 |
+
depending on UI or external frameworks.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AppException(Exception):
|
| 12 |
+
"""Base exception class for application-specific errors"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, message: str, user_message: str = None, recovery_hint: str = None):
|
| 15 |
+
self.message = message
|
| 16 |
+
self.user_message = user_message or message
|
| 17 |
+
self.recovery_hint = recovery_hint
|
| 18 |
+
super().__init__(message)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ValidationError(AppException):
|
| 22 |
+
"""Error for input validation failures"""
|
| 23 |
+
pass
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ProcessingError(AppException):
|
| 27 |
+
"""Error for document processing failures"""
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class AIError(AppException):
|
| 32 |
+
"""Error for AI service failures"""
|
| 33 |
+
pass
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ConfigError(AppException):
|
| 37 |
+
"""Error for configuration issues"""
|
| 38 |
+
pass
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class FileOperationError(AppException):
|
| 42 |
+
"""Error for file operation failures"""
|
| 43 |
+
pass
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class NetworkError(AppException):
|
| 47 |
+
"""Error for network-related failures"""
|
| 48 |
+
pass
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class LLMConnectionError(AIError):
|
| 52 |
+
"""Error for LLM API connection failures"""
|
| 53 |
+
pass
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class LLMAuthenticationError(AIError):
|
| 57 |
+
"""Error for LLM API authentication failures"""
|
| 58 |
+
pass
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class LLMTimeoutError(AIError):
|
| 62 |
+
"""Error for LLM API timeout failures"""
|
| 63 |
+
pass
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class LLMQuotaExceededError(AIError):
|
| 67 |
+
"""Error for LLM API quota/rate limit exceeded"""
|
| 68 |
+
pass
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class LLMInvalidResponseError(AIError):
|
| 72 |
+
"""Error for invalid LLM API responses"""
|
| 73 |
+
pass
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class DocumentProcessingError(ProcessingError):
|
| 77 |
+
"""Error for document processing failures"""
|
| 78 |
+
pass
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class SearchError(AppException):
|
| 82 |
+
"""Error for search operation failures"""
|
| 83 |
+
pass
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# Convenience functions for creating exceptions
|
| 87 |
+
def create_validation_error(message: str, recovery_hint: str = None) -> ValidationError:
|
| 88 |
+
"""Create a validation error with consistent formatting"""
|
| 89 |
+
return ValidationError(
|
| 90 |
+
message,
|
| 91 |
+
user_message=f"Validation error: {message}",
|
| 92 |
+
recovery_hint=recovery_hint or "Please check your input and try again"
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def create_processing_error(message: str, recovery_hint: str = None) -> ProcessingError:
|
| 97 |
+
"""Create a processing error with consistent formatting"""
|
| 98 |
+
return ProcessingError(
|
| 99 |
+
message,
|
| 100 |
+
user_message=f"Processing error: {message}",
|
| 101 |
+
recovery_hint=recovery_hint or "Please check your files and try again"
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def create_ai_error(message: str, recovery_hint: str = None) -> AIError:
|
| 106 |
+
"""Create an AI error with consistent formatting"""
|
| 107 |
+
return AIError(
|
| 108 |
+
message,
|
| 109 |
+
user_message=f"AI service error: {message}",
|
| 110 |
+
recovery_hint=recovery_hint or "Please check your API key and try again"
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def create_config_error(message: str, recovery_hint: str = None) -> ConfigError:
|
| 115 |
+
"""Create a configuration error with consistent formatting"""
|
| 116 |
+
return ConfigError(
|
| 117 |
+
message,
|
| 118 |
+
user_message=f"Configuration error: {message}",
|
| 119 |
+
recovery_hint=recovery_hint or "Please check your configuration and environment variables"
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def create_file_error(message: str, recovery_hint: str = None) -> FileOperationError:
|
| 124 |
+
"""Create a file operation error with consistent formatting"""
|
| 125 |
+
return FileOperationError(
|
| 126 |
+
message,
|
| 127 |
+
user_message=f"File error: {message}",
|
| 128 |
+
recovery_hint=recovery_hint or "Please check file permissions and paths"
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def create_network_error(message: str, recovery_hint: str = None) -> NetworkError:
|
| 133 |
+
"""Create a network error with consistent formatting"""
|
| 134 |
+
return NetworkError(
|
| 135 |
+
message,
|
| 136 |
+
user_message=f"Network error: {message}",
|
| 137 |
+
recovery_hint=recovery_hint or "Please check your internet connection and try again"
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def create_llm_connection_error(message: str, recovery_hint: str = None) -> LLMConnectionError:
|
| 142 |
+
"""Create an LLM connection error with consistent formatting"""
|
| 143 |
+
return LLMConnectionError(
|
| 144 |
+
message,
|
| 145 |
+
user_message=f"AI service connection error: {message}",
|
| 146 |
+
recovery_hint=recovery_hint or "Please check your internet connection and try again"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def create_llm_authentication_error(message: str, recovery_hint: str = None) -> LLMAuthenticationError:
|
| 151 |
+
"""Create an LLM authentication error with consistent formatting"""
|
| 152 |
+
return LLMAuthenticationError(
|
| 153 |
+
message,
|
| 154 |
+
user_message=f"AI service authentication error: {message}",
|
| 155 |
+
recovery_hint=recovery_hint or "Please check your API key and try again"
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def create_llm_timeout_error(message: str, recovery_hint: str = None) -> LLMTimeoutError:
|
| 160 |
+
"""Create an LLM timeout error with consistent formatting"""
|
| 161 |
+
return LLMTimeoutError(
|
| 162 |
+
message,
|
| 163 |
+
user_message=f"AI service timeout: {message}",
|
| 164 |
+
recovery_hint=recovery_hint or "Please try again in a few moments"
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def create_llm_quota_error(message: str, recovery_hint: str = None) -> LLMQuotaExceededError:
|
| 169 |
+
"""Create an LLM quota exceeded error with consistent formatting"""
|
| 170 |
+
return LLMQuotaExceededError(
|
| 171 |
+
message,
|
| 172 |
+
user_message=f"AI service quota exceeded: {message}",
|
| 173 |
+
recovery_hint=recovery_hint or "Please check your API usage limits and try again later"
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def create_llm_invalid_response_error(message: str, recovery_hint: str = None) -> LLMInvalidResponseError:
|
| 178 |
+
"""Create an LLM invalid response error with consistent formatting"""
|
| 179 |
+
return LLMInvalidResponseError(
|
| 180 |
+
message,
|
| 181 |
+
user_message=f"AI service returned invalid response: {message}",
|
| 182 |
+
recovery_hint=recovery_hint or "Please try again or contact support if the issue persists"
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def create_document_processing_error(message: str, recovery_hint: str = None) -> DocumentProcessingError:
|
| 187 |
+
"""Create a document processing error with consistent formatting"""
|
| 188 |
+
return DocumentProcessingError(
|
| 189 |
+
message,
|
| 190 |
+
user_message=f"Document processing error: {message}",
|
| 191 |
+
recovery_hint=recovery_hint or "Please check your document format and try again"
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def create_search_error(message: str, recovery_hint: str = None) -> SearchError:
|
| 196 |
+
"""Create a search error with consistent formatting"""
|
| 197 |
+
return SearchError(
|
| 198 |
+
message,
|
| 199 |
+
user_message=f"Search error: {message}",
|
| 200 |
+
recovery_hint=recovery_hint or "Please try adjusting your search terms"
|
| 201 |
+
)
|
app/core/knowledge_graph.py
ADDED
|
@@ -0,0 +1,639 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Knowledge Graph Module
|
| 4 |
+
|
| 5 |
+
This module provides efficient loading and querying of pre-computed knowledge graphs
|
| 6 |
+
in Streamlit applications. It's designed to work with graphs generated by the
|
| 7 |
+
build_knowledge_graphs.py script.
|
| 8 |
+
|
| 9 |
+
Key features:
|
| 10 |
+
- Fast graph loading with caching
|
| 11 |
+
- Rich query interface for graph exploration
|
| 12 |
+
- Integration with existing document processor workflow
|
| 13 |
+
- Memory-efficient graph operations
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import pickle
|
| 17 |
+
import json
|
| 18 |
+
import numpy as np
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Dict, List, Any, Optional, Set, Tuple
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
import streamlit as st
|
| 23 |
+
|
| 24 |
+
import networkx as nx
|
| 25 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 26 |
+
from app.core.config import get_config
|
| 27 |
+
from app.core.logging import logger
|
| 28 |
+
|
| 29 |
+
class KnowledgeGraphManager:
|
| 30 |
+
"""
|
| 31 |
+
Manages loading and querying of knowledge graphs for due diligence analysis.
|
| 32 |
+
|
| 33 |
+
This class provides a clean interface for working with pre-computed knowledge
|
| 34 |
+
graphs in Streamlit applications, with efficient caching and query capabilities.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(self, store_name: str):
|
| 38 |
+
"""
|
| 39 |
+
Initialize the knowledge graph manager for a specific company.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
store_name: The company store name (matches FAISS index name)
|
| 43 |
+
"""
|
| 44 |
+
self.store_name = store_name
|
| 45 |
+
self.graph: Optional[nx.MultiDiGraph] = None
|
| 46 |
+
self.metadata: Optional[Dict[str, Any]] = None
|
| 47 |
+
self.entities: Optional[Dict[str, List[Dict]]] = None
|
| 48 |
+
self.document_processor = None # Will be loaded on-demand for semantic search
|
| 49 |
+
self._config = get_config()
|
| 50 |
+
|
| 51 |
+
@st.cache_data(ttl=3600) # Cache for 1 hour
|
| 52 |
+
def load_graph(_self) -> bool:
|
| 53 |
+
"""
|
| 54 |
+
Load the knowledge graph from disk with caching.
|
| 55 |
+
|
| 56 |
+
Returns:
|
| 57 |
+
bool: True if graph was loaded successfully, False otherwise
|
| 58 |
+
"""
|
| 59 |
+
try:
|
| 60 |
+
graphs_dir = _self._config.paths['faiss_dir'] / 'knowledge_graphs'
|
| 61 |
+
|
| 62 |
+
# Load main graph
|
| 63 |
+
graph_file = graphs_dir / f"{_self.store_name}_knowledge_graph.pkl"
|
| 64 |
+
if not graph_file.exists():
|
| 65 |
+
logger.warning(f"Knowledge graph not found: {graph_file}")
|
| 66 |
+
return False
|
| 67 |
+
|
| 68 |
+
with open(graph_file, 'rb') as f:
|
| 69 |
+
_self.graph = pickle.load(f)
|
| 70 |
+
|
| 71 |
+
# Load metadata
|
| 72 |
+
metadata_file = graphs_dir / f"{_self.store_name}_graph_metadata.json"
|
| 73 |
+
if metadata_file.exists():
|
| 74 |
+
with open(metadata_file, 'r') as f:
|
| 75 |
+
_self.metadata = json.load(f)
|
| 76 |
+
|
| 77 |
+
# Load entities
|
| 78 |
+
entities_file = graphs_dir / f"{_self.store_name}_entities.json"
|
| 79 |
+
if entities_file.exists():
|
| 80 |
+
with open(entities_file, 'r') as f:
|
| 81 |
+
_self.entities = json.load(f)
|
| 82 |
+
|
| 83 |
+
logger.info(f"Loaded knowledge graph for {_self.store_name}: "
|
| 84 |
+
f"{len(_self.graph.nodes())} nodes, {len(_self.graph.edges())} edges")
|
| 85 |
+
return True
|
| 86 |
+
|
| 87 |
+
except Exception as e:
|
| 88 |
+
logger.error(f"Failed to load knowledge graph for {_self.store_name}: {e}")
|
| 89 |
+
return False
|
| 90 |
+
|
| 91 |
+
def is_available(self) -> bool:
|
| 92 |
+
"""Check if knowledge graph is available and loaded"""
|
| 93 |
+
return self.graph is not None and len(self.graph.nodes()) > 0
|
| 94 |
+
|
| 95 |
+
def get_summary_stats(self) -> Dict[str, Any]:
|
| 96 |
+
"""Get summary statistics about the knowledge graph"""
|
| 97 |
+
if not self.is_available():
|
| 98 |
+
return {}
|
| 99 |
+
|
| 100 |
+
stats = {
|
| 101 |
+
'num_entities': len(self.graph.nodes()),
|
| 102 |
+
'num_relationships': len(self.graph.edges()),
|
| 103 |
+
'entity_types': {},
|
| 104 |
+
'relationship_types': {},
|
| 105 |
+
'created_at': self.metadata.get('created_at') if self.metadata else None
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
# Count entity types
|
| 109 |
+
for node in self.graph.nodes():
|
| 110 |
+
node_type = self.graph.nodes[node].get('type', 'unknown')
|
| 111 |
+
stats['entity_types'][node_type] = stats['entity_types'].get(node_type, 0) + 1
|
| 112 |
+
|
| 113 |
+
# Count relationship types
|
| 114 |
+
for _, _, edge_data in self.graph.edges(data=True):
|
| 115 |
+
rel_type = edge_data.get('relationship', 'unknown')
|
| 116 |
+
stats['relationship_types'][rel_type] = stats['relationship_types'].get(rel_type, 0) + 1
|
| 117 |
+
|
| 118 |
+
return stats
|
| 119 |
+
|
| 120 |
+
def search_entities(self, query: str, entity_type: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
|
| 121 |
+
"""
|
| 122 |
+
Search for entities by name or content.
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
query: Search query string
|
| 126 |
+
entity_type: Filter by entity type (companies, people, etc.)
|
| 127 |
+
limit: Maximum number of results
|
| 128 |
+
|
| 129 |
+
Returns:
|
| 130 |
+
List of matching entities with metadata
|
| 131 |
+
"""
|
| 132 |
+
if not self.is_available():
|
| 133 |
+
return []
|
| 134 |
+
|
| 135 |
+
query_lower = query.lower()
|
| 136 |
+
results = []
|
| 137 |
+
|
| 138 |
+
for node in self.graph.nodes():
|
| 139 |
+
node_data = self.graph.nodes[node]
|
| 140 |
+
node_name = node_data.get('name', '').lower()
|
| 141 |
+
node_type = node_data.get('type', '')
|
| 142 |
+
|
| 143 |
+
# Filter by type if specified
|
| 144 |
+
if entity_type and node_type != entity_type:
|
| 145 |
+
continue
|
| 146 |
+
|
| 147 |
+
# Check if query matches name or context
|
| 148 |
+
if query_lower in node_name:
|
| 149 |
+
score = 1.0 if query_lower == node_name else 0.8
|
| 150 |
+
|
| 151 |
+
results.append({
|
| 152 |
+
'node_id': node,
|
| 153 |
+
'name': node_data.get('name', ''),
|
| 154 |
+
'type': node_type,
|
| 155 |
+
'score': score,
|
| 156 |
+
'sources': node_data.get('sources', ''),
|
| 157 |
+
'document_type': node_data.get('document_type', 'unknown'),
|
| 158 |
+
'context_samples': node_data.get('context_samples', [])[:2] # Limit context
|
| 159 |
+
})
|
| 160 |
+
|
| 161 |
+
# Sort by score and limit results
|
| 162 |
+
results.sort(key=lambda x: x['score'], reverse=True)
|
| 163 |
+
return results[:limit]
|
| 164 |
+
|
| 165 |
+
def get_entity_relationships(self, entity_name: str) -> Dict[str, List[Dict[str, Any]]]:
|
| 166 |
+
"""
|
| 167 |
+
Get all relationships for a specific entity.
|
| 168 |
+
|
| 169 |
+
Args:
|
| 170 |
+
entity_name: Name of the entity to find relationships for
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
Dictionary with 'incoming' and 'outgoing' relationship lists
|
| 174 |
+
"""
|
| 175 |
+
if not self.is_available():
|
| 176 |
+
return {'incoming': [], 'outgoing': []}
|
| 177 |
+
|
| 178 |
+
# Find matching nodes
|
| 179 |
+
matching_nodes = []
|
| 180 |
+
for node in self.graph.nodes():
|
| 181 |
+
if entity_name.lower() in self.graph.nodes[node].get('name', '').lower():
|
| 182 |
+
matching_nodes.append(node)
|
| 183 |
+
|
| 184 |
+
if not matching_nodes:
|
| 185 |
+
return {'incoming': [], 'outgoing': []}
|
| 186 |
+
|
| 187 |
+
relationships = {'incoming': [], 'outgoing': []}
|
| 188 |
+
|
| 189 |
+
for node in matching_nodes:
|
| 190 |
+
# Outgoing relationships
|
| 191 |
+
for _, target, edge_data in self.graph.out_edges(node, data=True):
|
| 192 |
+
relationships['outgoing'].append({
|
| 193 |
+
'target': self.graph.nodes[target].get('name', target),
|
| 194 |
+
'target_type': self.graph.nodes[target].get('type', 'unknown'),
|
| 195 |
+
'relationship': edge_data.get('relationship', 'unknown'),
|
| 196 |
+
'source_document': edge_data.get('source_document', ''),
|
| 197 |
+
'context': edge_data.get('context', '')[:200], # Truncate context
|
| 198 |
+
'confidence': edge_data.get('confidence', 0.0)
|
| 199 |
+
})
|
| 200 |
+
|
| 201 |
+
# Incoming relationships
|
| 202 |
+
for source, _, edge_data in self.graph.in_edges(node, data=True):
|
| 203 |
+
relationships['incoming'].append({
|
| 204 |
+
'source': self.graph.nodes[source].get('name', source),
|
| 205 |
+
'source_type': self.graph.nodes[source].get('type', 'unknown'),
|
| 206 |
+
'relationship': edge_data.get('relationship', 'unknown'),
|
| 207 |
+
'source_document': edge_data.get('source_document', ''),
|
| 208 |
+
'context': edge_data.get('context', '')[:200], # Truncate context
|
| 209 |
+
'confidence': edge_data.get('confidence', 0.0)
|
| 210 |
+
})
|
| 211 |
+
|
| 212 |
+
return relationships
|
| 213 |
+
|
| 214 |
+
def find_paths(self, source_entity: str, target_entity: str, max_length: int = 3) -> List[List[str]]:
|
| 215 |
+
"""
|
| 216 |
+
Find paths between two entities in the knowledge graph.
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
source_entity: Starting entity name
|
| 220 |
+
target_entity: Target entity name
|
| 221 |
+
max_length: Maximum path length to search
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
List of paths (each path is a list of entity names)
|
| 225 |
+
"""
|
| 226 |
+
if not self.is_available():
|
| 227 |
+
return []
|
| 228 |
+
|
| 229 |
+
# Find matching nodes
|
| 230 |
+
source_nodes = [n for n in self.graph.nodes()
|
| 231 |
+
if source_entity.lower() in self.graph.nodes[n].get('name', '').lower()]
|
| 232 |
+
target_nodes = [n for n in self.graph.nodes()
|
| 233 |
+
if target_entity.lower() in self.graph.nodes[n].get('name', '').lower()]
|
| 234 |
+
|
| 235 |
+
if not source_nodes or not target_nodes:
|
| 236 |
+
return []
|
| 237 |
+
|
| 238 |
+
paths = []
|
| 239 |
+
for source_node in source_nodes:
|
| 240 |
+
for target_node in target_nodes:
|
| 241 |
+
if source_node == target_node:
|
| 242 |
+
continue
|
| 243 |
+
|
| 244 |
+
try:
|
| 245 |
+
# Find all simple paths up to max_length
|
| 246 |
+
simple_paths = list(nx.all_simple_paths(
|
| 247 |
+
self.graph, source_node, target_node, cutoff=max_length
|
| 248 |
+
))
|
| 249 |
+
|
| 250 |
+
# Convert node IDs to entity names
|
| 251 |
+
for path in simple_paths[:5]: # Limit to 5 paths per pair
|
| 252 |
+
entity_path = [self.graph.nodes[node].get('name', node) for node in path]
|
| 253 |
+
paths.append(entity_path)
|
| 254 |
+
|
| 255 |
+
except nx.NetworkXNoPath:
|
| 256 |
+
continue
|
| 257 |
+
|
| 258 |
+
return paths[:10] # Return max 10 paths total
|
| 259 |
+
|
| 260 |
+
def get_central_entities(self, limit: int = 10) -> List[Dict[str, Any]]:
|
| 261 |
+
"""
|
| 262 |
+
Get the most central/important entities in the graph.
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
limit: Maximum number of entities to return
|
| 266 |
+
|
| 267 |
+
Returns:
|
| 268 |
+
List of entities with centrality scores
|
| 269 |
+
"""
|
| 270 |
+
if not self.is_available() or len(self.graph.nodes()) < 2:
|
| 271 |
+
return []
|
| 272 |
+
|
| 273 |
+
try:
|
| 274 |
+
# Calculate degree centrality
|
| 275 |
+
centrality = nx.degree_centrality(self.graph)
|
| 276 |
+
|
| 277 |
+
# Get top central entities
|
| 278 |
+
top_entities = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:limit]
|
| 279 |
+
|
| 280 |
+
results = []
|
| 281 |
+
for node, score in top_entities:
|
| 282 |
+
node_data = self.graph.nodes[node]
|
| 283 |
+
results.append({
|
| 284 |
+
'name': node_data.get('name', ''),
|
| 285 |
+
'type': node_data.get('type', 'unknown'),
|
| 286 |
+
'centrality_score': round(score, 3),
|
| 287 |
+
'num_connections': len(list(self.graph.neighbors(node))),
|
| 288 |
+
'sources': node_data.get('sources', '')
|
| 289 |
+
})
|
| 290 |
+
|
| 291 |
+
return results
|
| 292 |
+
|
| 293 |
+
except Exception as e:
|
| 294 |
+
logger.error(f"Error calculating centrality: {e}")
|
| 295 |
+
return []
|
| 296 |
+
|
| 297 |
+
def get_entity_clusters(self) -> List[List[str]]:
|
| 298 |
+
"""
|
| 299 |
+
Find clusters of related entities using community detection.
|
| 300 |
+
|
| 301 |
+
Returns:
|
| 302 |
+
List of clusters (each cluster is a list of entity names)
|
| 303 |
+
"""
|
| 304 |
+
if not self.is_available() or len(self.graph.nodes()) < 3:
|
| 305 |
+
return []
|
| 306 |
+
|
| 307 |
+
try:
|
| 308 |
+
# Convert to undirected graph for community detection
|
| 309 |
+
undirected = self.graph.to_undirected()
|
| 310 |
+
|
| 311 |
+
# Use simple connected components as clusters
|
| 312 |
+
components = list(nx.connected_components(undirected))
|
| 313 |
+
|
| 314 |
+
clusters = []
|
| 315 |
+
for component in components:
|
| 316 |
+
if len(component) > 1: # Only include clusters with multiple entities
|
| 317 |
+
cluster_names = [self.graph.nodes[node].get('name', node) for node in component]
|
| 318 |
+
clusters.append(cluster_names)
|
| 319 |
+
|
| 320 |
+
# Sort clusters by size
|
| 321 |
+
clusters.sort(key=len, reverse=True)
|
| 322 |
+
return clusters[:5] # Return top 5 clusters
|
| 323 |
+
|
| 324 |
+
except Exception as e:
|
| 325 |
+
logger.error(f"Error finding clusters: {e}")
|
| 326 |
+
return []
|
| 327 |
+
|
| 328 |
+
def export_graph_data(self) -> Dict[str, Any]:
|
| 329 |
+
"""
|
| 330 |
+
Export graph data for visualization or further analysis.
|
| 331 |
+
|
| 332 |
+
Returns:
|
| 333 |
+
Dictionary with nodes and edges data suitable for visualization
|
| 334 |
+
"""
|
| 335 |
+
if not self.is_available():
|
| 336 |
+
return {'nodes': [], 'edges': []}
|
| 337 |
+
|
| 338 |
+
# Export nodes
|
| 339 |
+
nodes = []
|
| 340 |
+
for node in self.graph.nodes():
|
| 341 |
+
node_data = self.graph.nodes[node]
|
| 342 |
+
nodes.append({
|
| 343 |
+
'id': node,
|
| 344 |
+
'name': node_data.get('name', ''),
|
| 345 |
+
'type': node_data.get('type', 'unknown'),
|
| 346 |
+
'sources': node_data.get('sources', ''),
|
| 347 |
+
'document_type': node_data.get('document_type', 'unknown')
|
| 348 |
+
})
|
| 349 |
+
|
| 350 |
+
# Export edges
|
| 351 |
+
edges = []
|
| 352 |
+
for source, target, edge_data in self.graph.edges(data=True):
|
| 353 |
+
edges.append({
|
| 354 |
+
'source': source,
|
| 355 |
+
'target': target,
|
| 356 |
+
'relationship': edge_data.get('relationship', 'unknown'),
|
| 357 |
+
'source_document': edge_data.get('source_document', ''),
|
| 358 |
+
'confidence': edge_data.get('confidence', 0.0)
|
| 359 |
+
})
|
| 360 |
+
|
| 361 |
+
return {
|
| 362 |
+
'nodes': nodes,
|
| 363 |
+
'edges': edges,
|
| 364 |
+
'metadata': self.metadata or {}
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
def _load_document_processor(self):
|
| 368 |
+
"""Load document processor for semantic search capabilities"""
|
| 369 |
+
if self.document_processor is None:
|
| 370 |
+
try:
|
| 371 |
+
from app.core.utils import create_document_processor
|
| 372 |
+
self.document_processor = create_document_processor(store_name=self.store_name)
|
| 373 |
+
if not self.document_processor.vector_store:
|
| 374 |
+
logger.warning(f"No FAISS vector store available for {self.store_name}")
|
| 375 |
+
self.document_processor = None
|
| 376 |
+
except Exception as e:
|
| 377 |
+
logger.error(f"Failed to load document processor for {self.store_name}: {e}")
|
| 378 |
+
self.document_processor = None
|
| 379 |
+
|
| 380 |
+
def semantic_search_entities(self, query: str, limit: int = 10, similarity_threshold: float = 0.3) -> List[Dict[str, Any]]:
|
| 381 |
+
"""
|
| 382 |
+
Perform semantic search on entities using FAISS embeddings.
|
| 383 |
+
|
| 384 |
+
This method finds entities whose source contexts are semantically similar
|
| 385 |
+
to the query, providing more intelligent search than simple text matching.
|
| 386 |
+
|
| 387 |
+
Args:
|
| 388 |
+
query: Natural language query
|
| 389 |
+
limit: Maximum number of results
|
| 390 |
+
similarity_threshold: Minimum similarity score to include
|
| 391 |
+
|
| 392 |
+
Returns:
|
| 393 |
+
List of entities with similarity scores and context
|
| 394 |
+
"""
|
| 395 |
+
if not self.is_available():
|
| 396 |
+
return []
|
| 397 |
+
|
| 398 |
+
# Load document processor if not already loaded
|
| 399 |
+
self._load_document_processor()
|
| 400 |
+
if not self.document_processor or not self.document_processor.vector_store:
|
| 401 |
+
logger.warning("Semantic search not available - falling back to text search")
|
| 402 |
+
return self.search_entities(query, limit=limit)
|
| 403 |
+
|
| 404 |
+
try:
|
| 405 |
+
# Perform semantic search on FAISS index
|
| 406 |
+
relevant_docs = self.document_processor.vector_store.similarity_search_with_score(
|
| 407 |
+
query, k=min(50, limit * 5) # Get more candidates for filtering
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# Map document chunks back to entities
|
| 411 |
+
entity_matches = []
|
| 412 |
+
seen_entities = set()
|
| 413 |
+
|
| 414 |
+
for doc, score in relevant_docs:
|
| 415 |
+
if score < similarity_threshold:
|
| 416 |
+
continue
|
| 417 |
+
|
| 418 |
+
# Find entities that originated from this document chunk
|
| 419 |
+
chunk_id = doc.metadata.get('chunk_id', '')
|
| 420 |
+
doc_source = doc.metadata.get('source', '')
|
| 421 |
+
|
| 422 |
+
# Search for entities that came from this chunk/document
|
| 423 |
+
for node in self.graph.nodes():
|
| 424 |
+
node_data = self.graph.nodes[node]
|
| 425 |
+
entity_sources = node_data.get('sources', '')
|
| 426 |
+
|
| 427 |
+
# Check if entity came from this document
|
| 428 |
+
if (doc_source and doc_source in entity_sources) or (chunk_id and chunk_id in str(node_data.get('context_samples', []))):
|
| 429 |
+
entity_key = f"{node_data.get('name', '')}_{node_data.get('type', '')}"
|
| 430 |
+
|
| 431 |
+
if entity_key not in seen_entities:
|
| 432 |
+
seen_entities.add(entity_key)
|
| 433 |
+
entity_matches.append({
|
| 434 |
+
'node_id': node,
|
| 435 |
+
'name': node_data.get('name', ''),
|
| 436 |
+
'type': node_data.get('type', 'unknown'),
|
| 437 |
+
'similarity_score': 1.0 - score, # Convert distance to similarity
|
| 438 |
+
'sources': entity_sources,
|
| 439 |
+
'document_type': node_data.get('document_type', 'unknown'),
|
| 440 |
+
'context_samples': node_data.get('context_samples', [])[:2],
|
| 441 |
+
'matching_context': doc.page_content[:300] # Show relevant context
|
| 442 |
+
})
|
| 443 |
+
|
| 444 |
+
if len(entity_matches) >= limit:
|
| 445 |
+
break
|
| 446 |
+
|
| 447 |
+
if len(entity_matches) >= limit:
|
| 448 |
+
break
|
| 449 |
+
|
| 450 |
+
# Sort by similarity score
|
| 451 |
+
entity_matches.sort(key=lambda x: x['similarity_score'], reverse=True)
|
| 452 |
+
return entity_matches[:limit]
|
| 453 |
+
|
| 454 |
+
except Exception as e:
|
| 455 |
+
logger.error(f"Semantic search failed: {e}")
|
| 456 |
+
# Fallback to regular text search
|
| 457 |
+
return self.search_entities(query, limit=limit)
|
| 458 |
+
|
| 459 |
+
    def find_related_entities_by_context(self, entity_name: str, limit: int = 5) -> List[Dict[str, Any]]:
        """
        Find entities related to the given entity via semantic similarity of
        their source contexts.

        Uses the reference entity's first stored context sample as a FAISS
        query, then maps similar document chunks back to other graph entities
        that cite those documents.

        Args:
            entity_name: Name of the reference entity (substring match; the
                first matching node is used as the reference).
            limit: Maximum number of related entities to return.

        Returns:
            Related entities with similarity scores, empty on any failure.
        """
        if not self.is_available():
            return []

        # Find the reference entity (case-insensitive substring on name).
        reference_entities = [n for n in self.graph.nodes()
                              if entity_name.lower() in self.graph.nodes[n].get('name', '').lower()]

        if not reference_entities:
            return []

        # Semantic search requires the FAISS-backed document processor.
        self._load_document_processor()
        if not self.document_processor or not self.document_processor.vector_store:
            return []

        try:
            # Only the FIRST matching node is used as the reference.
            reference_node = reference_entities[0]
            reference_data = self.graph.nodes[reference_node]
            context_samples = reference_data.get('context_samples', [])

            if not context_samples:
                return []

            # Use the first context sample (truncated) as the semantic query.
            query_context = context_samples[0][:500]  # Limit context length

            similar_docs = self.document_processor.vector_store.similarity_search_with_score(
                query_context, k=20
            )

            # Map similar chunks back to entities; seed the seen-set with the
            # reference entity's own name so it never appears in its results.
            related_entities = []
            seen_entities = {reference_data.get('name', '')}

            for doc, score in similar_docs:
                doc_source = doc.metadata.get('source', '')

                # Scan all nodes for entities sourced from this document.
                for node in self.graph.nodes():
                    if node == reference_node:
                        continue

                    node_data = self.graph.nodes[node]
                    entity_name_node = node_data.get('name', '')
                    entity_sources = node_data.get('sources', '')

                    if (entity_name_node not in seen_entities and
                        doc_source and doc_source in entity_sources):

                        seen_entities.add(entity_name_node)
                        related_entities.append({
                            'name': entity_name_node,
                            'type': node_data.get('type', 'unknown'),
                            # NOTE(review): score is treated as a FAISS
                            # distance here (similarity = 1 - score); values
                            # can go negative for large distances — confirm
                            # against the embedding backend.
                            'similarity_score': 1.0 - score,
                            'sources': entity_sources,
                            'context_samples': node_data.get('context_samples', [])[:1],
                            'relationship_reason': 'Semantic context similarity'
                        })

                        # Stop scanning nodes once we have enough results.
                        if len(related_entities) >= limit:
                            break

                # Propagate the early exit out of the document loop as well.
                if len(related_entities) >= limit:
                    break

            # Best matches first.
            related_entities.sort(key=lambda x: x['similarity_score'], reverse=True)
            return related_entities[:limit]

        except Exception as e:
            logger.error(f"Context-based entity search failed: {e}")
            return []
|
| 544 |
+
|
| 545 |
+
    def semantic_path_search(self, query: str, max_paths: int = 5) -> List[Dict[str, Any]]:
        """
        Find graph paths that are semantically relevant to a query.

        Strategy: first retrieve query-relevant entities via semantic search,
        then enumerate simple paths between each pair of the top entities and
        score each path by the mean similarity of its two endpoints.

        Args:
            query: Natural language description of what to find.
            max_paths: Maximum number of paths to return.

        Returns:
            Paths with relevance scores, sorted best-first.
        """
        if not self.is_available():
            return []

        # First, find entities semantically related to the query.
        relevant_entities = self.semantic_search_entities(query, limit=10)

        # Need at least two endpoints to form a path.
        if len(relevant_entities) < 2:
            return []

        paths_found = []

        # Pairwise over the top 5 entities only, to bound path searches at
        # C(5,2)=10 pairs.
        for i, entity1 in enumerate(relevant_entities[:5]):
            for entity2 in relevant_entities[i+1:]:
                try:
                    # Find paths between these entities (short paths only).
                    paths = self.find_paths(entity1['name'], entity2['name'], max_length=3)

                    for path in paths[:2]:  # Limit paths per pair
                        # Path relevance = mean of the endpoints' similarity
                        # scores; intermediate nodes are not scored.
                        path_score = (entity1['similarity_score'] + entity2['similarity_score']) / 2

                        paths_found.append({
                            'path': path,
                            'relevance_score': path_score,
                            'start_entity': entity1['name'],
                            'end_entity': entity2['name'],
                            'query_relevance': f"Related to: {query}",
                            'path_length': len(path) - 1
                        })

                        if len(paths_found) >= max_paths:
                            break

                except Exception as e:
                    # Path finding for one pair failing should not abort the
                    # whole search.
                    logger.debug(f"Path finding failed between {entity1['name']} and {entity2['name']}: {e}")
                    continue

                # Propagate the early exit through both entity loops.
                if len(paths_found) >= max_paths:
                    break

            if len(paths_found) >= max_paths:
                break

        # Best (highest mean endpoint similarity) first.
        paths_found.sort(key=lambda x: x['relevance_score'], reverse=True)
        return paths_found[:max_paths]
|
| 603 |
+
|
| 604 |
+
@st.cache_data(ttl=3600)
def get_available_knowledge_graphs() -> List[str]:
    """
    List the store names that have a pre-built knowledge graph on disk.

    Returns:
        Sorted store names derived from ``*_knowledge_graph.pkl`` files in
        the knowledge_graphs directory; empty list if the directory is
        missing or scanning fails.
    """
    try:
        graphs_dir = get_config().paths['faiss_dir'] / 'knowledge_graphs'
        if not graphs_dir.exists():
            return []

        # Strip the filename suffix to recover each store name.
        return sorted(
            path.stem.replace('_knowledge_graph', '')
            for path in graphs_dir.glob("*_knowledge_graph.pkl")
        )
    except Exception as e:
        logger.error(f"Error getting available knowledge graphs: {e}")
        return []
|
| 628 |
+
|
| 629 |
+
def create_knowledge_graph_manager(store_name: str) -> KnowledgeGraphManager:
    """
    Factory for :class:`KnowledgeGraphManager`.

    Args:
        store_name: Company store name (matches the FAISS index name).

    Returns:
        A manager bound to that store; call ``load_graph()`` on it before use.
    """
    return KnowledgeGraphManager(store_name)
|
app/core/logging.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Logging Configuration Module
|
| 4 |
+
|
| 5 |
+
Provides consistent logging setup for the application.
|
| 6 |
+
This replaces the old src-based logging with a cleaner, app-specific solution.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from logging.handlers import RotatingFileHandler
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def configure_langchain_logging(log_level: str = "WARNING") -> None:
    """
    Quiet down LangChain's loggers to reduce console noise.

    Args:
        log_level: Level name applied to every LangChain module logger
            (default: WARNING).
    """
    level = getattr(logging, log_level.upper())
    for module_name in (
        "langchain",
        "langchain_core",
        "langchain_community",
        "langchain_huggingface",
    ):
        logging.getLogger(module_name).setLevel(level)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def setup_logging(
    name: str = "dd_poc",
    log_level: str = "INFO",
    log_file: "str | None" = None
) -> logging.Logger:
    """
    Set up standard Python logging with console + rotating file handlers.

    Idempotent: calling it again with the same ``name`` returns the already
    configured logger unchanged.

    Args:
        name: Logger name.
        log_level: Logging level name (e.g. "INFO", "DEBUG").
        log_file: Optional log file path; a default under ``.logs/`` is used
            when omitted. (BUGFIX: annotation was ``str = None``.)

    Returns:
        Configured logger instance.
    """
    logger = logging.getLogger(name)

    # Avoid duplicate handlers when called repeatedly for the same name.
    if logger.handlers:
        return logger

    logger.setLevel(getattr(logging, log_level.upper()))

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    logger.addHandler(console_handler)

    # File logging is best-effort; it fails silently on read-only
    # filesystems (e.g. Streamlit Cloud).
    # BUGFIX: removed the dead `if log_file or True:` condition, which was
    # always true and only obscured that file logging is unconditional.
    try:
        log_dir = Path(".logs")
        log_dir.mkdir(exist_ok=True)

        if not log_file:
            log_file = log_dir / f"dd_poc_{Path.cwd().name}.log"

        # RotatingFileHandler caps disk usage at ~60MB (10MB x 6 files).
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5
        )
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
        ))
        logger.addHandler(file_handler)
    except Exception:
        # File logging not available; console logging still works.
        pass

    return logger
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# Global logger instance
|
| 94 |
+
logger = setup_logging()
|
app/core/model_cache.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Model Cache Manager
|
| 4 |
+
|
| 5 |
+
Provides global caching for HuggingFace models to prevent re-downloads
|
| 6 |
+
across multiple instances and sessions.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 13 |
+
from sentence_transformers import CrossEncoder
|
| 14 |
+
|
| 15 |
+
from app.core.logging import logger
|
| 16 |
+
|
| 17 |
+
# Optional accelerate import
|
| 18 |
+
try:
|
| 19 |
+
from accelerate import Accelerator
|
| 20 |
+
ACCELERATE_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
ACCELERATE_AVAILABLE = False
|
| 23 |
+
Accelerator = None
|
| 24 |
+
|
| 25 |
+
# Global model cache
|
| 26 |
+
_EMBEDDINGS_CACHE = {}
|
| 27 |
+
_CROSS_ENCODER_CACHE = {}
|
| 28 |
+
|
| 29 |
+
# Local models directory
|
| 30 |
+
_MODELS_DIR = Path(__file__).parent.parent.parent / "models"
|
| 31 |
+
|
| 32 |
+
def _get_local_model_path(model_name: str) -> Optional[Path]:
    """
    Resolve a HuggingFace model name to a locally cached directory, if any.

    Args:
        model_name: HuggingFace model name (e.g.
            "sentence-transformers/all-mpnet-base-v2").

    Returns:
        Path to the local model directory, or None when the name has no
        namespace or no matching directory exists under the models dir.
    """
    if "/" not in model_name:
        return None

    short_name = model_name.split("/")[-1]

    # Namespace decides the on-disk subdirectory layout.
    if model_name.startswith("sentence-transformers/"):
        candidate = _MODELS_DIR / "sentence_transformers" / short_name
    elif model_name.startswith("cross-encoder/"):
        candidate = _MODELS_DIR / "cross_encoder" / short_name
    else:
        candidate = _MODELS_DIR / short_name

    return candidate if candidate.exists() else None
|
| 61 |
+
|
| 62 |
+
def get_cached_embeddings(model_name: str = "sentence-transformers/all-mpnet-base-v2") -> HuggingFaceEmbeddings:
    """
    Return a process-wide shared HuggingFace embeddings model.

    The model is instantiated at most once per model name and reused across
    all callers. A local snapshot (via _get_local_model_path) is preferred
    over downloading from HuggingFace. When accelerate is installed, an
    Accelerator is created so the chosen device is logged.
    """
    cached = _EMBEDDINGS_CACHE.get(model_name)
    if cached is not None:
        logger.debug(f"Using cached embeddings model: {model_name}")
        return cached

    local_path = _get_local_model_path(model_name)
    if local_path:
        logger.info(f"Using local embeddings model: {local_path}")
        model = HuggingFaceEmbeddings(model_name=str(local_path))
    else:
        logger.info(f"Downloading embeddings model: {model_name}")
        model = HuggingFaceEmbeddings(model_name=model_name)

    # Report device placement via accelerate when available; the Accelerator
    # itself handles placement, we only surface the device in the logs.
    if ACCELERATE_AVAILABLE:
        try:
            accelerator = Accelerator()
            logger.info(f"Embeddings model optimized for device: {accelerator.device}")
        except Exception as e:
            logger.warning(f"Failed to optimize embeddings with accelerate: {e}")

    _EMBEDDINGS_CACHE[model_name] = model
    return model
|
| 94 |
+
|
| 95 |
+
def get_cached_cross_encoder(model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2') -> CrossEncoder:
    """
    Return a process-wide shared cross-encoder model.

    The model is instantiated at most once per model name and reused across
    all callers. A local snapshot (via _get_local_model_path) is preferred
    over downloading from HuggingFace.
    """
    if model_name in _CROSS_ENCODER_CACHE:
        logger.debug(f"Using cached cross-encoder model: {model_name}")
        return _CROSS_ENCODER_CACHE[model_name]

    local_path = _get_local_model_path(model_name)
    if local_path:
        logger.info(f"Using local cross-encoder model: {local_path}")
        encoder = CrossEncoder(str(local_path))
    else:
        logger.info(f"Downloading cross-encoder model: {model_name}")
        encoder = CrossEncoder(model_name)

    _CROSS_ENCODER_CACHE[model_name] = encoder
    return encoder
|
| 115 |
+
|
| 116 |
+
def clear_model_cache():
    """
    Drop every cached embeddings and cross-encoder model.

    Useful for memory management or testing; models are rebuilt lazily on
    the next get_cached_* call.
    """
    for cache in (_EMBEDDINGS_CACHE, _CROSS_ENCODER_CACHE):
        cache.clear()
    logger.info("Model cache cleared")
|
app/core/parsers.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
LLM-based parsing functions for due diligence documents.
|
| 4 |
+
|
| 5 |
+
This module provides modern structured output parsing using Pydantic models
|
| 6 |
+
to ensure reliable, type-safe parsing of LLM responses.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
from app.core.logging import logger
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def parse_checklist(checklist_text: str, llm) -> Dict:
    """
    Parse a markdown due-diligence checklist using Pydantic structured output.

    Uses LangChain's PydanticOutputParser so the LLM returns data matching
    the StructuredChecklist schema, then flattens it into a plain dict.

    Args:
        checklist_text: The raw checklist text to parse
        llm: LLM instance to use for parsing

    Returns:
        Dictionary mapping category keys to {'name': ..., 'items': [...]},
        where each item is {'text': ..., 'original': ...}

    Raises:
        ValueError: If llm is None
        RuntimeError: If parsing fails for any reason (original error chained)
    """
    if llm is None:
        raise ValueError("LLM parameter is required")

    try:
        from langchain_core.output_parsers import PydanticOutputParser
        from app.ai.processing_pipeline import StructuredChecklist
        from app.ai.prompts import get_checklist_parsing_prompt

        # Set up structured output parser
        parser = PydanticOutputParser(pydantic_object=StructuredChecklist)

        # Centralized prompt from prompts.py (avoids duplicating the template)
        prompt = get_checklist_parsing_prompt()
        formatted_prompt = prompt.format_messages(
            checklist_text=checklist_text,
            format_instructions=parser.get_format_instructions()
        )

        logger.info(f"Sending checklist to LLM for parsing (length: {len(checklist_text)} chars)")
        llm_response = llm.invoke(formatted_prompt)
        logger.debug(f"LLM response length: {len(llm_response.content)} chars")

        result = parser.parse(llm_response.content)

        # Flatten the Pydantic model into the dict shape callers expect
        categories_dict = {
            key: {
                'name': category.name,
                'items': [
                    {'text': item.text, 'original': item.original}
                    for item in category.items
                ],
            }
            for key, category in result.categories.items()
        }

        logger.info(f"Successfully parsed {len(categories_dict)} categories: {list(categories_dict.keys())}")
        return categories_dict

    except Exception as e:
        # Chain the original exception so the root cause survives in tracebacks
        raise RuntimeError(f"Structured parsing failed: {str(e)}") from e
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def parse_questions(questions_text: str, llm) -> List[Dict]:
    """
    Parse markdown due-diligence questions using Pydantic structured output.

    Args:
        questions_text: The raw questions text to parse
        llm: LLM instance to use for parsing

    Returns:
        List of dicts with 'category', 'question' and 'id' keys

    Raises:
        ValueError: If llm is None
        RuntimeError: If parsing fails for any reason (original error chained)
    """
    if llm is None:
        raise ValueError("LLM parameter is required")

    try:
        # Single consolidated import block (the original imported prompts in
        # two places and pulled in HumanMessage without ever using it)
        from langchain_core.output_parsers import PydanticOutputParser
        from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
        from langchain_core.messages import SystemMessage
        from app.ai.processing_pipeline import StructuredQuestions

        # Set up structured output parser
        parser = PydanticOutputParser(pydantic_object=StructuredQuestions)

        # Create prompt with format instructions
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="""
You are a document parser. Parse the due diligence questions document into the EXACT JSON format specified.

CRITICAL:
- Return ONLY valid JSON, no additional text or explanations
- Extract categories (like "### A. Category Name")
- Extract numbered questions within each category
- Clean up markdown formatting but preserve core text
- Follow the exact format specified in the format instructions

The output must be valid JSON that can be parsed directly.
"""),
            HumanMessagePromptTemplate.from_template("""Parse these questions into the exact JSON format:

{questions_text}

Required JSON schema:
{format_instructions}

Return only the JSON:""")
        ])

        formatted_prompt = prompt.format_messages(
            questions_text=questions_text,
            format_instructions=parser.get_format_instructions()
        )

        # Get LLM response and parse it into the Pydantic model
        llm_response = llm.invoke(formatted_prompt)
        result = parser.parse(llm_response.content)

        # Flatten the Pydantic model into the list shape callers expect
        return [
            {'category': q.category, 'question': q.question, 'id': q.id}
            for q in result.questions
        ]

    except Exception as e:
        # Chain the original exception so the root cause survives in tracebacks
        raise RuntimeError(f"Structured parsing failed: {str(e)}") from e
|
app/core/performance.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Performance Optimization Module
|
| 4 |
+
|
| 5 |
+
This module provides performance optimizations using prebuilt libraries:
|
| 6 |
+
- diskcache: Smart caching system
|
| 7 |
+
- joblib: Function result caching
|
| 8 |
+
- httpx: Async HTTP client
|
| 9 |
+
- backoff: Retry logic with exponential backoff
|
| 10 |
+
- psutil: System resource monitoring
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import asyncio
|
| 14 |
+
import hashlib
|
| 15 |
+
import logging
|
| 16 |
+
import time
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Dict, List, Any, Optional, Callable, TypeVar, Union
|
| 19 |
+
from functools import wraps
|
| 20 |
+
|
| 21 |
+
import diskcache
|
| 22 |
+
import joblib
|
| 23 |
+
import httpx
|
| 24 |
+
import backoff
|
| 25 |
+
import psutil
|
| 26 |
+
from tqdm import tqdm
|
| 27 |
+
|
| 28 |
+
# Optional imports for GPU/CPU optimization.
# ACCELERATE_AVAILABLE gates every accelerate/torch-specific code path below.
try:
    import accelerate
    ACCELERATE_AVAILABLE = True
except ImportError:
    ACCELERATE_AVAILABLE = False

# memory_profiler is optional; the flag lets callers enable detailed memory
# profiling only when the package is actually installed.
try:
    import memory_profiler
    MEMORY_PROFILER_AVAILABLE = True
except ImportError:
    MEMORY_PROFILER_AVAILABLE = False

from app.core.config import get_config

logger = logging.getLogger(__name__)

# Generic type variable used by the decorators below to preserve the
# wrapped function's return type.
T = TypeVar('T')
|
| 48 |
+
class PerformanceManager:
    """Central manager for performance optimizations.

    Owns the on-disk caches (document content, embeddings, joblib memory),
    a shared async HTTP client with retry logic, and helpers for memory
    monitoring, batch sizing and device selection.
    """

    def __init__(self):
        self.config = get_config()
        self._setup_caches()
        self._setup_clients()

    def _setup_caches(self):
        """Initialize disk-backed caching systems under the FAISS directory."""
        faiss_dir = self.config.paths['faiss_dir']
        faiss_dir.mkdir(parents=True, exist_ok=True)

        # Document content cache (keyed by file-content hash)
        self.doc_cache = diskcache.Cache(
            str(faiss_dir / '.doc_cache'),
            size_limit=500 * 1024 * 1024,  # 500MB
            eviction_policy='least-recently-used'
        )

        # Embedding cache (keyed by text hash)
        self.embedding_cache = diskcache.Cache(
            str(faiss_dir / '.embedding_cache'),
            size_limit=2 * 1024 * 1024 * 1024,  # 2GB
            eviction_policy='least-recently-used'
        )

        # Joblib memory cache for expensive computations
        self.memory = joblib.Memory(
            location=str(faiss_dir / '.joblib_cache'),
            verbose=0,
            compress=True
        )

    def _setup_clients(self):
        """Initialize the shared async HTTP client used for AI API calls."""
        self.http_client = httpx.AsyncClient(
            timeout=httpx.Timeout(60.0, connect=10.0),
            limits=httpx.Limits(max_connections=10, max_keepalive_connections=5)
        )

    @staticmethod
    def get_file_hash(file_path: Path) -> str:
        """Calculate SHA256 hash of file content, streamed in 4KB chunks."""
        hash_sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()

    def cache_document_content(self, file_path: Path, content: str) -> None:
        """Cache document content keyed by the file's content hash."""
        file_hash = self.get_file_hash(file_path)
        cache_key = f"doc_content:{file_hash}"
        self.doc_cache.set(cache_key, content, expire=86400 * 30)  # 30 days

    def get_cached_document_content(self, file_path: Path) -> Optional[str]:
        """Return cached document content, or None on a cache miss."""
        file_hash = self.get_file_hash(file_path)
        cache_key = f"doc_content:{file_hash}"
        return self.doc_cache.get(cache_key)

    def cache_embeddings(self, text_hash: str, embeddings: List[List[float]]) -> None:
        """Cache embeddings keyed by a precomputed content hash."""
        cache_key = f"embeddings:{text_hash}"
        self.embedding_cache.set(cache_key, embeddings, expire=86400 * 30)

    def get_cached_embeddings(self, text_hash: str) -> Optional[List[List[float]]]:
        """Return cached embeddings, or None on a cache miss."""
        cache_key = f"embeddings:{text_hash}"
        return self.embedding_cache.get(cache_key)

    @backoff.on_exception(
        backoff.expo,
        (httpx.RequestError, httpx.TimeoutException),
        max_tries=3,
        jitter=backoff.random_jitter
    )
    async def make_api_request(self, url: str, **kwargs) -> httpx.Response:
        """Make an API request with automatic exponential-backoff retries.

        NOTE(review): httpx.AsyncClient.request requires the HTTP method,
        so callers must pass it via kwargs (e.g. method="POST") — confirm
        all call sites do.
        """
        return await self.http_client.request(url=url, **kwargs)

    def monitor_memory_usage(self) -> Dict[str, float]:
        """Return current process memory usage in MB (plus GPU stats in GB if available)."""
        process = psutil.Process()
        memory_info = process.memory_info()

        result = {
            'rss': memory_info.rss / 1024 / 1024,  # MB
            'vms': memory_info.vms / 1024 / 1024,  # MB
            'percent': process.memory_percent()
        }

        # Add GPU memory info when accelerate/torch with CUDA are available
        if ACCELERATE_AVAILABLE:
            try:
                import torch
                if torch.cuda.is_available():
                    gpu_memory = torch.cuda.get_device_properties(0)
                    result.update({
                        'gpu_total': gpu_memory.total_memory / 1024 / 1024 / 1024,  # GB
                        'gpu_allocated': torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024,  # GB
                        'gpu_reserved': torch.cuda.memory_reserved(0) / 1024 / 1024 / 1024,  # GB
                    })
            except Exception as e:
                logger.debug(f"Could not get GPU memory info: {e}")

        return result

    def should_gc_collect(self, memory_usage: Dict[str, float]) -> bool:
        """Return True when memory pressure warrants a garbage collection."""
        return memory_usage['percent'] > 80.0 or memory_usage['rss'] > 2000  # 80% or 2GB

    def cleanup_cache(self) -> Dict[str, int]:
        """Remove expired entries from both disk caches; returns removal counts."""
        doc_cleaned = self.doc_cache.expire()
        embedding_cleaned = self.embedding_cache.expire()

        return {
            'doc_cache_cleaned': doc_cleaned,
            'embedding_cache_cleaned': embedding_cleaned
        }

    async def close(self):
        """Release the HTTP client and close both disk caches."""
        await self.http_client.aclose()
        self.doc_cache.close()
        self.embedding_cache.close()

    def optimize_batch_size(self, available_memory: float, item_size_estimate: float = 0.1) -> int:
        """Estimate a batch size from available memory (MB), clamped to [1, 1000]."""
        # Reserve 20% of memory for overhead
        usable_memory = available_memory * 0.8

        # Estimate optimal batch size
        optimal_batch = int(usable_memory / item_size_estimate)

        # Clamp to reasonable bounds
        return max(1, min(optimal_batch, 1000))

    def get_optimal_device(self) -> str:
        """Return 'cuda' when a CUDA GPU is usable, else 'cpu'."""
        if ACCELERATE_AVAILABLE:
            try:
                import torch
                if torch.cuda.is_available():
                    return 'cuda'
            except Exception:
                # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
                # are no longer swallowed.
                pass
        return 'cpu'

    def setup_accelerate(self):
        """Create and store an Accelerator; returns it, or None on failure."""
        if ACCELERATE_AVAILABLE:
            try:
                from accelerate import Accelerator
                self.accelerator = Accelerator()
                logger.info(f"Accelerate initialized with device: {self.accelerator.device}")
                return self.accelerator
            except Exception as e:
                logger.warning(f"Failed to initialize accelerate: {e}")
        return None
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# Global performance manager instance
|
| 214 |
+
_perf_manager = None
|
| 215 |
+
|
| 216 |
+
def get_performance_manager() -> PerformanceManager:
|
| 217 |
+
"""Get global performance manager instance"""
|
| 218 |
+
global _perf_manager
|
| 219 |
+
if _perf_manager is None:
|
| 220 |
+
_perf_manager = PerformanceManager()
|
| 221 |
+
return _perf_manager
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# Decorators for easy optimization
|
| 225 |
+
# Sentinel distinguishing "not cached" from a legitimately cached None result
_CACHE_MISS = object()


def cached_by_content(func: Callable[..., T]) -> Callable[..., T]:
    """Cache function results on disk, keyed by a hash of the arguments.

    Only string/Path positional arguments (after the first) and string/Path
    keyword values participate in the key; other argument types are ignored,
    so two calls differing only in a non-string argument share a cache entry.

    Fixes over the original:
    - a cached ``None`` result is now served from cache instead of being
      recomputed on every call (sentinel-based miss detection);
    - keyword arguments now contribute to the cache key.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Build a stable key from string-like arguments.
        # NOTE(review): args[0] is assumed to be `self` and skipped — confirm
        # this decorator is only ever applied to methods.
        content_parts = [str(a) for a in args[1:] if isinstance(a, (str, Path))]
        content_parts.extend(
            f"{k}={v}" for k, v in sorted(kwargs.items()) if isinstance(v, (str, Path))
        )

        content_hash = hashlib.sha256(
            '|'.join(content_parts).encode()
        ).hexdigest()[:16]

        perf_manager = get_performance_manager()
        cache_key = f"{func.__name__}:{content_hash}"

        # Sentinel default so a cached None is a hit, not a permanent miss
        result = perf_manager.doc_cache.get(cache_key, default=_CACHE_MISS)
        if result is not _CACHE_MISS:
            logger.debug(f"Cache hit for {func.__name__}")
            return result

        # Compute and cache
        result = func(*args, **kwargs)
        perf_manager.doc_cache.set(cache_key, result, expire=86400 * 7)  # 7 days
        return result

    return wrapper
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def memory_cached(func: Callable[..., T]) -> Callable[..., T]:
    """Cache function results on disk via the shared joblib memory cache.

    Note: the PerformanceManager (and its caches) is created at decoration
    time, not at first call.
    """
    return get_performance_manager().memory.cache(func)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def monitor_performance(func: Callable[..., T]) -> Callable[..., T]:
    """Log the wrapped function's duration and RSS delta; GC under pressure."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.time()
        manager = get_performance_manager()

        # Snapshot memory before running the wrapped function
        before = manager.monitor_memory_usage()

        try:
            return func(*args, **kwargs)
        finally:
            # Snapshot memory after, even when the call raised
            after = manager.monitor_memory_usage()
            elapsed = time.time() - started

            logger.debug(
                f"{func.__name__}: {elapsed:.2f}s, "
                f"Memory: {before['rss']:.1f}MB -> {after['rss']:.1f}MB"
            )

            # Reclaim memory when usage crossed the configured thresholds
            if manager.should_gc_collect(after):
                import gc
                gc.collect()
                logger.debug("Garbage collection triggered")

    return wrapper
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# Utility functions
|
| 296 |
+
def get_text_hash(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
    digest = hashlib.sha256(text.encode())
    return digest.hexdigest()[:16]
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def parallel_process(items: List[T], func: Callable[[T], Any],
                     max_workers: int = 4, desc: str = "Processing") -> List[Any]:
    """Apply *func* to each item using a thread pool, with a progress bar.

    Results are returned in the same order as *items*. (The original
    implementation collected results in completion order, which silently
    broke any caller that zipped results back with inputs.)

    Args:
        items: Inputs to process.
        func: Callable applied to each item; exceptions propagate to the caller.
        max_workers: Thread pool size.
        desc: Progress bar description.

    Returns:
        List of func(item) results, ordered like *items*.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Pre-size the result list so completion order doesn't matter
    results: List[Any] = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its input position
        futures = {executor.submit(func, item): idx for idx, item in enumerate(items)}

        with tqdm(total=len(items), desc=desc) as pbar:
            for future in as_completed(futures):
                results[futures[future]] = future.result()
                pbar.update(1)

    return results
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def optimize_embedding_batch(texts: List[str], embeddings_model,
                             batch_size: int = 32) -> List[List[float]]:
    """Optimize embedding generation with dynamic batching.

    Embeds *texts* in batches, shrinking the requested batch_size when memory
    is constrained and triggering GC between batches under pressure. A batch
    that fails is replaced by empty embedding lists (one per text) so the
    output length always matches the input length.
    """
    perf_manager = get_performance_manager()

    # Get current memory stats for batch optimization.
    # NOTE(review): 'rss' is memory already in use, not memory available —
    # optimize_batch_size treats it as "available"; confirm this is intended.
    mem_info = perf_manager.monitor_memory_usage()
    available_memory = mem_info['rss']

    # Dynamically adjust batch size (never grows past the caller's value)
    optimal_batch = perf_manager.optimize_batch_size(available_memory, item_size_estimate=0.001)
    batch_size = min(batch_size, optimal_batch)

    logger.info(f"Using optimized batch size: {batch_size} (memory: {available_memory:.1f}MB)")

    all_embeddings = []

    # Process in optimized batches
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # Snapshot memory before processing (currently unused; kept for parity)
        mem_before = perf_manager.monitor_memory_usage()

        try:
            # Generate embeddings for this batch
            batch_embeddings = embeddings_model.embed_documents(batch)
            all_embeddings.extend(batch_embeddings)

            # Snapshot memory after processing
            mem_after = perf_manager.monitor_memory_usage()

            # Trigger GC if memory usage is high
            if perf_manager.should_gc_collect(mem_after):
                import gc
                gc.collect()
                logger.debug("GC triggered during embedding generation")

        except Exception as e:
            logger.error(f"Failed to process embedding batch {i//batch_size}: {e}")
            # Best-effort: continue with empty embeddings for this batch so
            # output stays aligned with input
            all_embeddings.extend([[] for _ in batch])

    return all_embeddings
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
async def gather_with_concurrency(n: int, *tasks):
    """Await all *tasks* concurrently, with at most *n* in flight at a time.

    Results are returned in the order the tasks were passed (asyncio.gather
    semantics).
    """
    gate = asyncio.Semaphore(n)

    async def bounded(awaitable):
        # Each task waits for a semaphore slot before starting real work
        async with gate:
            return await awaitable

    return await asyncio.gather(*map(bounded, tasks))
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
# Cleanup function for graceful shutdown
|
| 377 |
+
async def cleanup_performance_resources():
|
| 378 |
+
"""Clean up performance resources"""
|
| 379 |
+
global _perf_manager
|
| 380 |
+
if _perf_manager:
|
| 381 |
+
await _perf_manager.close()
|
| 382 |
+
_perf_manager = None
|
app/core/ranking.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Ranking utilities for search results reranking.
|
| 4 |
+
|
| 5 |
+
This module provides functions for reranking search results using cross-encoder models
|
| 6 |
+
to improve relevance scoring. Separated from search.py to avoid circular imports.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
from app.core.logging import logger
|
| 11 |
+
from app.core.model_cache import get_cached_cross_encoder
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def rerank_results(query: str, candidates: List[Dict]) -> List[Dict]:
    """
    Rerank search results with a cross-encoder for improved relevance.

    Each candidate gains a 'reranked_score' key and has its 'score'
    overwritten with the cross-encoder score, then the list is sorted
    best-first. The input list is mutated in place and returned. On any
    failure the candidates are returned unchanged with original scores.

    Args:
        query: The search query
        candidates: Candidate documents, each with at least a 'text' key

    Returns:
        The candidates, reranked (or unchanged on empty input / failure)
    """
    if not candidates:
        return candidates

    try:
        encoder = get_cached_cross_encoder()

        # Score every (query, document) pair with the cross-encoder
        pairs = [(query, item['text']) for item in candidates]
        scores = encoder.predict(pairs)

        for item, score in zip(candidates, scores):
            value = float(score)
            item['reranked_score'] = value
            item['score'] = value  # keep the main score consistent

        # Higher cross-encoder score means more relevant
        candidates.sort(key=lambda item: item['reranked_score'], reverse=True)

        logger.info(f"Reranked {len(candidates)} results using cross-encoder")
        return candidates

    except Exception as e:
        logger.warning(f"Cross-encoder reranking failed: {e}. Using original scores.")
        return candidates
|
app/core/reports.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Report generation functions for due diligence analysis.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict
|
| 7 |
+
|
| 8 |
+
from app.core.logging import logger
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def generate_reports_from_cache(checklist_results: Dict, questions_answers: Dict, strategy_text: str, checklist_text: str, questions_text: str) -> Dict:
    """Generate reports from cached results (placeholder implementation)."""
    logger.info("Generating reports from cache")

    strategic = strategy_text[:500] if strategy_text else "No strategy provided"
    return {
        'overview': "Report generated from cached data",
        'strategic': strategic,
        'checklist_summary': f"Processed {len(checklist_results)} categories",
        'questions_summary': f"Processed {len(questions_answers)} questions",
    }
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def generate_reports(checklist_results: Dict, questions_answers: Dict, strategy_text: str, checklist_text: str, questions_text: str) -> Dict:
    """Generate comprehensive reports (placeholder implementation)."""
    logger.info("Generating comprehensive reports")

    strategic = strategy_text[:1000] if strategy_text else "No strategy provided"
    return {
        'overview': "Comprehensive report generated",
        'strategic': strategic,
        'checklist_summary': f"Processed {len(checklist_results)} categories with detailed analysis",
        'questions_summary': f"Processed {len(questions_answers)} questions with detailed answers",
    }
|
app/core/search.py
ADDED
|
@@ -0,0 +1,773 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Search and analysis functions for document retrieval and ranking.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# Standard library imports
|
| 7 |
+
from typing import Dict, List
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Third-party imports for Unicode normalization
|
| 11 |
+
import unidecode
|
| 12 |
+
|
| 13 |
+
# Third-party imports
|
| 14 |
+
import numpy as np
|
| 15 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 16 |
+
from langchain.chains.retrieval import create_retrieval_chain
|
| 17 |
+
from langchain_community.vectorstores import FAISS
|
| 18 |
+
from langchain_core.prompts import PromptTemplate
|
| 19 |
+
|
| 20 |
+
# Local imports
|
| 21 |
+
from app.core.constants import SIMILARITY_THRESHOLD
|
| 22 |
+
from app.core.document_processor import DocumentProcessor
|
| 23 |
+
from app.core.logging import logger
|
| 24 |
+
from app.core.ranking import rerank_results
|
| 25 |
+
from app.core.sparse_index import load_sparse_index_for_store, BM25Index
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def search_and_analyze(queries: List[Dict], vector_store: FAISS, llm=None, threshold: float = SIMILARITY_THRESHOLD, search_type: str = 'items', store_name: str = None, session=None) -> Dict:
    """Unified entry point for checklist matching and question answering.

    Uses direct FAISS search for accurate scores and dispatches to the
    appropriate pipeline based on ``search_type``.

    Args:
        queries: Checklist dict (when ``search_type == 'items'``) or a list of
            question dicts (otherwise).
        vector_store: FAISS vector store to retrieve from.
        llm: Optional chat model; when provided, a RAG chain is assembled for
            the questions pipeline.
        threshold: Minimum similarity score for a match.
        search_type: ``'items'`` for checklist processing; anything else is
            treated as question processing.
        store_name: Store identifier used to locate document-type classifications.
        session: Session object holding preloaded document-type embeddings.

    Returns:
        Dict of results from the selected processing pipeline.
    """

    # Create RAG chain if LLM is provided.
    # NOTE(review): the chain is built even when search_type == 'items', where
    # it is never used (_process_checklist_items does not take qa_chain) —
    # wasted work on that path; confirm before moving the construction.
    qa_chain = None
    if llm:
        # Retriever filters by score; questions use a smaller k than items.
        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": threshold, "k": 5 if search_type == 'questions' else 10}
        )

        prompt_template = PromptTemplate(
            input_variables=["context", "input"],
            template="""Use the provided context to answer the question. Be concise and factual.

Context: {context}

Question: {input}

Answer:"""
        )
        # Create the document chain and then the retrieval chain
        document_chain = create_stuff_documents_chain(llm, prompt_template)
        qa_chain = create_retrieval_chain(retriever, document_chain)

    # Dispatch: checklist items never use the RAG chain; questions may.
    if search_type == 'items':
        return _process_checklist_items(queries, vector_store, threshold, store_name, session)
    else:
        return _process_questions(queries, vector_store, threshold, qa_chain, llm)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _process_checklist_items(checklist: Dict, vector_store: FAISS, threshold: float, store_name: str = None, session=None) -> Dict:
    """Match checklist items against LLM-generated document type classifications.

    Instead of searching chunk text, each checklist item is compared (by
    embedding cosine similarity) against the per-document type labels loaded
    from ``{store_name}_document_types.json``.

    Args:
        checklist: Mapping of category letter -> {'name', 'items': [...]}.
        vector_store: FAISS store (only passed through to _load_document_types).
        threshold: Minimum cosine similarity to count as a match.
        store_name: Store identifier used to locate the classification file.
        session: Session object carrying ``document_type_embeddings``;
            mutated here when embeddings are auto-preloaded.

    Returns:
        Dict keyed by category letter with per-item match lists; {} when
        required embeddings or classifications are unavailable.
    """

    # Ensure checklist embeddings are preloaded first (function-attribute cache
    # on get_checklist_embedding; populated during data room processing).
    if not hasattr(get_checklist_embedding, '_cache') or not get_checklist_embedding._cache:
        logger.info("Checklist embeddings cache is empty, preloading...")
        try:
            from app.core.search import preload_checklist_embeddings
            count = preload_checklist_embeddings()
            logger.info(f"✅ Preloaded {count} checklist embeddings for processing")
        except Exception as e:
            logger.error(f"Failed to preload checklist embeddings: {e}")
            return {}

    # Ensure document type embeddings are available (diagnostics only).
    if session:
        logger.debug(f"Checklist processing session ID: {id(session)}, has embeddings: {hasattr(session, 'document_type_embeddings')}")
        if hasattr(session, 'document_type_embeddings'):
            logger.debug(f"Embeddings count: {len(session.document_type_embeddings) if session.document_type_embeddings else 0}")

    # Try to auto-preload embeddings if missing.
    embeddings_missing = not session or not hasattr(session, 'document_type_embeddings') or not session.document_type_embeddings

    if embeddings_missing and store_name:
        logger.info(f"Document type embeddings missing, attempting auto-preload for {store_name}...")
        try:
            from app.core.search import preload_document_type_embeddings
            type_embeddings = preload_document_type_embeddings(store_name)
            # Lazily create the session-side cache, then merge in the new embeddings.
            if not hasattr(session, 'document_type_embeddings') or session.document_type_embeddings is None:
                session.document_type_embeddings = {}
            session.document_type_embeddings.update(type_embeddings)
            logger.info(f"✅ Auto-preloaded {len(type_embeddings)} document type embeddings")
            embeddings_missing = False
        except Exception as e:
            logger.warning(f"Failed to auto-preload document type embeddings: {e}")

    if embeddings_missing:
        logger.error("Document type embeddings not available. Checklist processing requires preloaded embeddings.")
        logger.error("Make sure data room processing completed successfully or embeddings can be auto-loaded.")
        return {}

    # Load document type classifications - these are our primary comparison targets.
    doc_types = {}
    if store_name:
        doc_types = _load_document_types(vector_store, store_name)

    if not doc_types:
        logger.warning(f"No document type classifications found for {store_name}")
        return {}

    results = {}
    for cat_letter, category in checklist.items():
        cat_results = {
            'name': category['name'],
            'items': [],
            'total_items': len(category['items']),
            'matched_items': 0
        }

        for item in category['items']:
            checklist_item_text = item['text'].lower().strip()
            matches = []

            # Compare checklist item against each document's type classification.
            for doc_path, doc_type in doc_types.items():
                if not doc_type or doc_type == 'not classified':
                    continue

                doc_type_lower = doc_type.lower().strip()

                # Calculate semantic similarity between checklist item and document type.
                try:
                    # Get checklist embedding from memory cache (preloaded during data room processing).
                    checklist_embedding = get_checklist_embedding(checklist_item_text)

                    # Get document type embedding (from preloaded cache).
                    doc_type_embedding = get_document_type_embedding(doc_type_lower, session)

                    # Calculate cosine similarity (dot product over norms).
                    import numpy as np
                    similarity = np.dot(checklist_embedding, doc_type_embedding) / (
                        np.linalg.norm(checklist_embedding) * np.linalg.norm(doc_type_embedding)
                    )

                    # Only include matches above threshold.
                    if similarity >= threshold:
                        # Find the document metadata from the vector store.
                        # We need to get the document name and other metadata.
                        doc_name = _extract_doc_name_from_path(doc_path)

                        matches.append({
                            'name': doc_name,
                            'path': doc_path,
                            'full_path': doc_path,  # For consistency
                            'score': round(float(similarity), 3),
                            'document_type': doc_type,
                            'text': f"Document type: {doc_type}"  # Include document type as text
                        })

                except Exception as e:
                    # Missing cached embeddings raise RuntimeError; skip this doc.
                    logger.warning(f"Error calculating similarity for {doc_path}: {e}")
                    continue

            # Sort matches by score (highest first).
            matches.sort(key=lambda x: x['score'], reverse=True)

            # Limit to top matches for performance.
            matches = matches[:10]

            if matches:
                cat_results['matched_items'] += 1
                logger.info(f"✅ Found {len(matches)} matches for checklist item: '{checklist_item_text[:50]}...'")

            cat_results['items'].append({
                'text': item['text'],
                'original': item['original'],
                'matches': matches
            })

        results[cat_letter] = cat_results

    return results
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _load_document_types(vector_store, store_name: str):
    """Load LLM-generated document type classifications for a store.

    Reads ``{store_name}_document_types.json`` from the configured FAISS
    directory.

    Args:
        vector_store: Unused; kept for interface compatibility with callers.
        store_name: Store identifier whose classification file is loaded.

    Returns:
        dict: Mapping of document path -> document type, or {} when the file
        is missing or unreadable.
    """
    try:
        # Local imports keep module import time low and avoid cycles.
        import json

        from app.core.config import get_app_config

        config = get_app_config()
        doc_types_path = config.paths['faiss_dir'] / f"{store_name}_document_types.json"
        if doc_types_path.exists():
            # Explicit encoding: the file is written as UTF-8 elsewhere.
            with open(doc_types_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except Exception as e:
        logger.warning(f"Failed to load document types for {store_name}: {e}")
    return {}
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _extract_doc_name_from_path(doc_path: str) -> str:
|
| 200 |
+
"""Extract document name from file path"""
|
| 201 |
+
try:
|
| 202 |
+
path_obj = Path(doc_path)
|
| 203 |
+
return path_obj.name
|
| 204 |
+
except Exception:
|
| 205 |
+
# Fallback: extract name from path string
|
| 206 |
+
return doc_path.split('/')[-1] if '/' in doc_path else doc_path.split('\\')[-1] if '\\' in doc_path else doc_path
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def get_checklist_embedding(checklist_text: str):
    """
    Get cached embedding for checklist item from in-memory cache.

    This function only uses an in-memory cache (stored as a function
    attribute ``_cache``) that should be preloaded during data room
    processing by :func:`preload_checklist_embeddings`. It fails fast —
    there is deliberately no on-demand embedding fallback.

    Args:
        checklist_text: The checklist item text to look up. Lookup is
            case-insensitive and Unicode-normalized via ``unidecode``.

    Returns:
        numpy array: The embedding vector for the checklist text.

    Raises:
        RuntimeError: If the embedding is not found in the cache.
    """
    # Initialize cache if not exists (defensive; preload should have done this).
    if not hasattr(get_checklist_embedding, '_cache'):
        get_checklist_embedding._cache = {}
        logger.warning("Checklist embedding cache was not initialized - this should not happen!")

    # Create cache key from checklist text with normalized Unicode.
    cache_key = checklist_text.lower().strip()
    # Use unidecode for comprehensive Unicode to ASCII conversion — must match
    # the normalization applied when the cache was populated.
    cache_key = unidecode.unidecode(cache_key)

    # Check in-memory cache only.
    if cache_key in get_checklist_embedding._cache:
        return get_checklist_embedding._cache[cache_key]

    # Enhanced debugging for troubleshooting a cache miss.
    cache_size = len(get_checklist_embedding._cache)
    logger.warning(f"Checklist embedding not found: '{checklist_text[:50]}...'")
    logger.warning(f"Cache key generated: '{cache_key}'")
    logger.warning(f"Cache has {cache_size} items total")

    if cache_size > 0:
        # Look for similar keys to help debug (any shared word longer than 3 chars).
        similar_keys = []
        search_terms = checklist_text.lower().split()
        for key in get_checklist_embedding._cache.keys():
            if any(term in key for term in search_terms if len(term) > 3):
                similar_keys.append(key)

        if similar_keys:
            logger.warning(f"Similar keys found: {similar_keys[:3]}")
        else:
            logger.warning("No similar keys found in cache")

        # Show a few sample keys.
        sample_keys = list(get_checklist_embedding._cache.keys())[:5]
        logger.warning(f"Sample cache keys: {sample_keys}")
    else:
        logger.error("Cache is completely empty - embeddings were not preloaded!")

    # Fail if not found - no fallbacks.
    raise RuntimeError(
        f"Checklist embedding not found for: '{checklist_text[:50]}...' (cache key: '{cache_key}'). "
        f"Cache has {cache_size} items. "
        "Make sure embeddings were preloaded during data room processing."
    )
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def get_document_type_embedding(doc_type: str, session=None):
    """Look up the preloaded embedding for a document type label.

    The session is expected to carry a ``document_type_embeddings`` dict
    populated during data room processing; keys are lowercase,
    whitespace-stripped, unidecode-normalized type labels. There is no
    on-demand computation fallback.

    Args:
        doc_type: Document type label to look up.
        session: Session object carrying the preloaded embedding cache.

    Returns:
        numpy.ndarray: The embedding vector.

    Raises:
        RuntimeError: When no cache is available or the label is absent.
    """
    cached = getattr(session, 'document_type_embeddings', None) if session else None
    if not cached:
        raise RuntimeError(f"Document type embedding not found for: '{doc_type[:50]}...'. Preloaded embeddings required.")

    # Normalize exactly like the preload step so keys line up.
    normalized_key = unidecode.unidecode(doc_type.lower().strip())

    try:
        return cached[normalized_key]
    except KeyError:
        raise RuntimeError(f"Document type embedding not found for: '{doc_type[:50]}...' (cache key: '{normalized_key}')") from None
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def generate_checklist_embeddings() -> int:
    """
    Generate embeddings for all checklist items and save to disk.

    This function should be called during the build process to pre-calculate
    embeddings for all checklist items found in the ``*.md`` files of the
    configured checklist directory. Results are written as JSON to
    ``checklist_embeddings.json`` in the FAISS directory, keyed by
    lowercase, unidecode-normalized item text.

    Returns:
        int: Number of embeddings generated and saved (0 when no checklist
        files are found).

    Raises:
        RuntimeError: If the overall generation process fails (individual
        item/file failures are logged and skipped).
    """
    try:
        # Local imports avoid import cycles with app.core.config.
        from app.core.config import get_config
        from app.core.model_cache import get_cached_embeddings
        import json
        import numpy as np

        config = get_config()
        embeddings_model = get_cached_embeddings()
        checklist_dir = config.paths['checklist_dir']

        logger.info("🔄 Generating checklist embeddings...")

        # Initialize embeddings cache (cache_key -> embedding as list).
        embeddings_cache = {}

        # Process all checklist files.
        checklist_files = list(checklist_dir.glob("*.md"))
        if not checklist_files:
            logger.warning(f"No checklist files found in {checklist_dir}")
            return 0

        for checklist_file in checklist_files:
            logger.info(f"Processing checklist: {checklist_file.name}")

            try:
                # Read checklist content.
                content = checklist_file.read_text(encoding='utf-8')

                # Parse checklist items from markdown.
                checklist_items = _parse_checklist_items_from_markdown(content)

                # Generate embeddings for each item.
                for item_text in checklist_items:
                    # Normalize Unicode in cache key — must match the lookup
                    # normalization in get_checklist_embedding.
                    cache_key = item_text.lower().strip()
                    cache_key = unidecode.unidecode(cache_key)

                    # Skip if already processed (duplicate items across files).
                    if cache_key in embeddings_cache:
                        continue

                    try:
                        # Generate embedding.
                        embedding = embeddings_model.embed_query(item_text)

                        # Handle both list and numpy array cases (JSON needs lists).
                        if hasattr(embedding, 'tolist'):
                            embeddings_cache[cache_key] = embedding.tolist()
                        else:
                            # Already a list
                            embeddings_cache[cache_key] = embedding

                        logger.debug(f"✅ Embedded: {item_text[:50]}...")

                    except Exception as e:
                        logger.warning(f"Failed to embed checklist item '{item_text[:50]}...': {e}")
                        continue

            except Exception as e:
                logger.error(f"Failed to process checklist file {checklist_file}: {e}")
                continue

        # Save to disk.
        cache_file = config.paths['faiss_dir'] / "checklist_embeddings.json"
        cache_file.parent.mkdir(parents=True, exist_ok=True)

        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(embeddings_cache, f, indent=2, ensure_ascii=False)

        logger.info(f"💾 Saved {len(embeddings_cache)} checklist embeddings to {cache_file}")
        return len(embeddings_cache)

    except Exception as e:
        error_msg = f"Failed to generate checklist embeddings: {e}"
        logger.error(error_msg)
        raise RuntimeError(error_msg)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def _parse_checklist_items_from_markdown(content: str) -> list:
    """Extract checklist item texts from markdown content.

    Recognizes numbered entries ("1. Item text") and dash bullets
    ("- Item text"), skipping blank lines, headers ('#'), separator lines
    ('⸻'), all-uppercase lines, and fragments of 10 characters or fewer.

    Args:
        content: Markdown content containing checklist items

    Returns:
        list: List of checklist item texts
    """
    import re

    # Compiled once; anchored so only a leading "N. " prefix matches.
    numbered_prefix = re.compile(r'^\d+\.\s+')
    items = []

    for raw_line in content.split('\n'):
        stripped = raw_line.strip()

        # Blank lines, markdown headers and separator rules carry no items.
        if not stripped or stripped.startswith('#') or stripped.startswith('⸻'):
            continue

        is_bullet = stripped.startswith('- ')
        if not is_bullet and not numbered_prefix.match(stripped):
            continue

        # Drop the bullet dash or the numeric prefix to get the item text.
        if is_bullet:
            candidate = stripped[2:].strip()
        else:
            candidate = numbered_prefix.sub('', stripped).strip()

        # Filter out section headings (all caps) and too-short fragments.
        if len(candidate) > 10 and not candidate.isupper():
            items.append(candidate)

    logger.info(f"Parsed {len(items)} checklist items from markdown")
    return items
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def preload_checklist_embeddings():
    """
    Preload all checklist embeddings into memory during data room processing.

    Loads pre-calculated embeddings from ``checklist_embeddings.json`` into
    the in-memory function-attribute cache used by
    :func:`get_checklist_embedding`. When the file is missing it attempts to
    generate it on the fly via :func:`generate_checklist_embeddings`.
    Should be called once during data room processing.

    Returns:
        int: Number of embeddings successfully preloaded.

    Raises:
        RuntimeError: If the embeddings file doesn't exist and can't be
        generated, or can't be loaded.
    """
    try:
        from app.core.config import get_config
        import json
        import numpy as np

        config = get_config()
        cache_file = config.paths['faiss_dir'] / "checklist_embeddings.json"

        if not cache_file.exists():
            logger.warning(f"Checklist embeddings file not found: {cache_file}")
            logger.info("Generating checklist embeddings now...")

            # Try to generate embeddings on-the-fly; this also writes cache_file.
            try:
                generated_count = generate_checklist_embeddings()
                if generated_count > 0:
                    logger.info(f"✅ Generated {generated_count} embeddings, now preloading...")
                else:
                    raise RuntimeError("No checklist items found to embed")
            except Exception as gen_error:
                # Re-wrapped below by the outer handler as well.
                raise RuntimeError(
                    f"Could not generate checklist embeddings: {gen_error}. "
                    "Make sure checklist files exist and are properly formatted."
                )

        # Initialize cache (function attribute shared with get_checklist_embedding).
        if not hasattr(get_checklist_embedding, '_cache'):
            get_checklist_embedding._cache = {}

        # Load all embeddings from disk.
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)

        # Convert and cache all embeddings in memory.
        preloaded_count = 0
        for cache_key, embedding_list in cache_data.items():
            # Normalize Unicode in cache key to match search normalization.
            normalized_key = unidecode.unidecode(cache_key)
            embedding_array = np.array(embedding_list, dtype=np.float32)
            get_checklist_embedding._cache[normalized_key] = embedding_array
            preloaded_count += 1

        logger.info(f"✅ Preloaded {preloaded_count} checklist embeddings into memory")
        return preloaded_count

    except Exception as e:
        error_msg = f"Failed to preload checklist embeddings: {e}"
        logger.error(error_msg)
        raise RuntimeError(error_msg)
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
def preload_document_type_embeddings(store_name: str):
    """
    Preload all document type embeddings into memory during data room processing.

    Loads the document type classifications for ``store_name`` and computes
    one embedding per unique, normalized type label, so the checklist
    pipeline never embeds at query time.

    Args:
        store_name: Store identifier whose classifications to embed.

    Returns:
        dict: Mapping of normalized document type -> float32 numpy embedding.

    Raises:
        RuntimeError: If document types can't be loaded or the overall
        process fails (individual type failures are logged and skipped).
    """
    try:
        from app.core.model_cache import get_cached_embeddings
        import numpy as np

        # Load document type classifications.
        doc_types = _load_document_types(None, store_name)
        if not doc_types:
            raise RuntimeError(f"No document type classifications found for {store_name}")

        # Get embeddings model.
        embeddings = get_cached_embeddings()

        # Collect all unique document types, normalized the same way lookups are.
        unique_types = {
            unidecode.unidecode(doc_type.lower().strip())
            for doc_type in doc_types.values()
            if doc_type and doc_type != 'not classified'
        }

        # Precompute one embedding per unique type.
        type_embeddings = {}
        for doc_type in unique_types:
            try:
                embedding = embeddings.embed_query(doc_type)
                # embed_query may return a list or an ndarray; np.array accepts
                # both (the original if/else branches were identical).
                type_embeddings[doc_type] = np.array(embedding, dtype=np.float32)
            except Exception as e:
                logger.warning(f"Failed to compute embedding for document type '{doc_type}': {e}")
                continue

        logger.info(f"✅ Precomputed {len(type_embeddings)} document type embeddings")
        return type_embeddings

    except Exception as e:
        error_msg = f"Failed to preload document type embeddings: {e}"
        logger.error(error_msg)
        # Chain the cause for easier debugging upstream.
        raise RuntimeError(error_msg) from e
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def _process_questions(queries: List[Dict], vector_store: FAISS, threshold: float, qa_chain=None, llm=None) -> Dict:
    """Dispatch question processing to the RAG batch pipeline or plain search.

    An empty query list short-circuits to an empty result set. Supplying a
    qa_chain without an LLM is a configuration error.

    Args:
        queries: Question dicts with at least a 'question' key.
        vector_store: FAISS store to retrieve from.
        threshold: Minimum similarity score for matches.
        qa_chain: Optional RAG chain; its presence selects the RAG path.
        llm: Chat model required when qa_chain is given.

    Returns:
        Dict with a 'questions' list of per-question results.

    Raises:
        ValueError: When qa_chain is provided without an LLM.
    """
    if not queries:
        return {'questions': []}

    # Guard-clause dispatch: no chain -> simple search; chain without LLM -> error.
    if not qa_chain:
        return _process_questions_simple_search(queries, vector_store, threshold)
    if not llm:
        raise ValueError("LLM required for RAG processing but not provided")
    return _process_questions_with_rag_batch(queries, vector_store, threshold, llm)
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def _process_questions_with_rag_batch(queries: List[Dict], vector_store: FAISS, threshold: float, llm) -> Dict:
    """Answer questions via parallel LLM calls over retrieved context.

    Fail-fast design: any single failed batch item raises rather than
    degrading to partial results.

    Args:
        queries: Question dicts with 'question' and optional 'category'.
        vector_store: FAISS store queried per question (k=5).
        threshold: Minimum converted similarity for a doc to count as relevant.
        llm: Chat model handed to the batch processor.

    Returns:
        Dict with a 'questions' list: question, category, answer, sources,
        method='rag_batch', has_answer.

    Raises:
        RuntimeError: When any batch item fails or returns no response.
    """
    from app.ai.agent_utils import create_batch_processor
    from langchain_core.messages import HumanMessage

    # Create batch processor (up to 5 concurrent LLM calls).
    batch_processor = create_batch_processor(llm, max_concurrency=5)

    logger.info(f"Processing {len(queries)} questions using batch processing")

    # Prepare all batch inputs.
    batch_inputs = []
    question_contexts = []

    for query in queries:
        question = query['question']

        # Retrieve documents for this question.
        docs_with_scores = vector_store.similarity_search_with_score(question, k=5)
        # Convert FAISS distance to a similarity in [0, 1]: 1 - score/2 for
        # score <= 2, else 0 — presumably assumes L2 distance on normalized
        # vectors (max distance 2); TODO confirm against the index settings.
        relevant_docs = [doc for doc, score in docs_with_scores if (1.0 - (score / 2.0) if score <= 2.0 else 0.0) >= threshold]

        # Create context and sources.
        if relevant_docs:
            context = "\n".join([f"- {doc.metadata.get('name', 'Unknown')}: {doc.page_content[:200]}..."
                                for doc in relevant_docs[:5]])
            # Same distance→similarity conversion for the reported scores.
            sources = [{'name': doc.metadata.get('name', ''),
                        'path': doc.metadata.get('path', ''),
                        'score': round(1.0 - (score / 2.0) if score <= 2.0 else 0.0, 3)}
                       for doc, score in docs_with_scores[:5] if (1.0 - (score / 2.0) if score <= 2.0 else 0.0) >= threshold]
        else:
            context = ""
            sources = []

        # Sources are kept positionally; batch results are matched back by index.
        question_contexts.append(sources)

        # Create prompt.
        prompt_content = f"""Use the provided context to answer the question. Be concise and factual.

Context: {context}

Question: {question}

Answer:"""

        messages = [HumanMessage(content=prompt_content)]
        batch_inputs.append((messages, query))

    # Process batch - fail if anything goes wrong.
    batch_results = batch_processor.invoke(batch_inputs)

    # Build results.
    results = []
    for idx, result in enumerate(batch_results):
        if not result['success'] or not result['response']:
            raise RuntimeError(f"Failed to process question: {result['item_info']['question']}")

        query = result['item_info']
        answer = result['response'].content.strip()
        sources = question_contexts[idx]

        results.append({
            'question': query['question'],
            'category': query.get('category', ''),
            'answer': answer,
            'sources': sources,
            'method': 'rag_batch',
            'has_answer': bool(answer and answer.strip())
        })

    return {'questions': results}
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def _process_questions_simple_search(queries: List[Dict], vector_store: FAISS, threshold: float) -> Dict:
    """
    Answer each query via a plain similarity search, without any RAG/LLM step.

    For every question the top-5 vector matches at or above ``threshold``
    are listed as sources, and a short template answer naming those
    documents is produced. Already fast, so no batching is needed.

    Args:
        queries: List of dicts, each with a 'question' and optional 'category'.
        vector_store: FAISS store used for similarity lookups.
        threshold: Minimum score a match must reach to count as a source.

    Returns:
        Dict with a 'questions' list of per-question result records.
    """
    answered = []

    for entry in queries:
        question_text = entry['question']

        # Plain vector lookup; keep only matches at or above the threshold.
        matches = vector_store.similarity_search_with_score(question_text, k=5)
        sources = [
            {
                'name': doc.metadata.get('name', ''),
                'path': doc.metadata.get('path', ''),
                'score': round(score, 3),
            }
            for doc, score in matches
            if score >= threshold
        ]

        if sources:
            names = ', '.join(src['name'] for src in sources)
            answer = f"Based on the following documents: {names}"
        else:
            answer = "No relevant documents found"

        answered.append({
            'question': question_text,
            'category': entry.get('category', ''),
            'answer': answer,
            'sources': sources,
            'method': 'search',
            'has_answer': bool(sources),
        })

    return {'questions': answered}
|
| 670 |
+
|
| 671 |
+
|
| 672 |
+
def search_documents(query: str, document_processor: DocumentProcessor, top_k: int = 5, threshold: float = None):
    """
    Search documents by delegating to the document processor.

    Returns an empty list when no processor is available, otherwise
    whatever the processor's own search() produces.
    """
    return document_processor.search(query, top_k=top_k, threshold=threshold) if document_processor else []
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
def hybrid_search(query: str, vector_store: FAISS, store_name: str,
                  top_k: int = 10, sparse_weight: float = 0.3,
                  dense_weight: float = 0.7, threshold: float = SIMILARITY_THRESHOLD) -> List[Dict]:
    """
    Hybrid search combining sparse (BM25) and dense (FAISS) retrieval.

    Both legs fetch up to ``2 * top_k`` candidates; their weighted scores
    are merged per document and the combined ranking is truncated to
    ``top_k``. If no BM25 index exists for the store, the search degrades
    gracefully to dense-only.

    Args:
        query: Search query.
        vector_store: FAISS vector store for dense retrieval.
        store_name: Name of the document store (selects the BM25 index).
        top_k: Number of top results to return.
        sparse_weight: Weight for sparse scores (0-1).
        dense_weight: Weight for dense scores (0-1).
        threshold: Minimum similarity threshold for dense retrieval.

    Returns:
        Combined search results sorted by hybrid score (descending).
    """
    logger.info(f"Performing hybrid search for query: {query[:50]}...")

    # Sparse (BM25) leg -- optional, depends on a prebuilt index on disk.
    bm25_index = load_sparse_index_for_store(store_name)
    if bm25_index:
        sparse_hits = bm25_index.search(query, top_k=top_k*2)
        logger.info(f"Sparse search returned {len(sparse_hits)} results")
    else:
        sparse_hits = []
        logger.warning(f"No sparse index found for {store_name}, falling back to dense only")

    # Dense (FAISS) leg, filtered by the similarity threshold.
    dense_hits = [
        {
            'doc_id': doc.metadata.get('source', ''),
            'document': doc.page_content,
            'score': float(score),
            'metadata': doc.metadata,
        }
        for doc, score in vector_store.similarity_search_with_score(query, k=top_k*2)
        if score >= threshold
    ]
    logger.info(f"Dense search returned {len(dense_hits)} results")

    # Merge the two legs on doc_id with weighted per-leg scores.
    merged = {}
    for hit in sparse_hits:
        merged[hit['doc_id']] = {
            'sparse_score': hit['score'] * sparse_weight,
            'dense_score': 0.0,
            'result': hit,
        }
    for hit in dense_hits:
        entry = merged.get(hit['doc_id'])
        if entry is not None:
            entry['dense_score'] = hit['score'] * dense_weight
        else:
            merged[hit['doc_id']] = {
                'sparse_score': 0.0,
                'dense_score': hit['score'] * dense_weight,
                'result': hit,
            }

    # Materialize unified result records with the combined score attached.
    ranked = []
    for entry in merged.values():
        combined = entry['sparse_score'] + entry['dense_score']
        unified = entry['result'].copy()
        unified.update({
            'hybrid_score': combined,
            # Report the raw (unweighted) per-leg scores back to callers.
            'sparse_score': entry['sparse_score'] / sparse_weight if sparse_weight > 0 else 0,
            'dense_score': entry['dense_score'] / dense_weight if dense_weight > 0 else 0,
            'score': combined  # For backward compatibility
        })
        ranked.append(unified)

    # Rank by combined score and truncate.
    ranked.sort(key=lambda item: item['hybrid_score'], reverse=True)
    top_results = ranked[:top_k]
    logger.info(f"Hybrid search returned {len(top_results)} final results")

    return top_results
|
| 772 |
+
|
| 773 |
+
|
app/core/sparse_index.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BM25 Sparse Index Implementation for Due Diligence Documents
|
| 4 |
+
|
| 5 |
+
This module provides BM25-based sparse retrieval that complements the existing
|
| 6 |
+
dense retrieval system. The index is pre-calculated locally and persisted
|
| 7 |
+
to disk for fast loading on Streamlit Cloud.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pickle
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
from typing import List, Dict, Optional, Callable, Tuple
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
from rank_bm25 import BM25Okapi
|
| 17 |
+
from app.core.logging import logger
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class BM25Index:
    """
    BM25-based sparse index for document retrieval.

    This class provides:
    - Pre-calculated BM25 index persistence (pickled to disk)
    - Custom tokenization for legal/financial documents
    - Efficient search with relevance scoring
    - Integration with existing document processing pipeline
    """

    def __init__(self, index_path: str):
        """
        Initialize BM25 index.

        Args:
            index_path: Path to save/load the index file
        """
        self.index_path = Path(index_path)
        self.bm25: Optional[BM25Okapi] = None
        # Raw document texts and their ids, kept as parallel lists.
        self.documents: List[str] = []
        self.doc_ids: List[str] = []
        self.tokenized_docs: List[List[str]] = []
        # Build statistics (document/token counts), filled by build_index().
        self.metadata: Dict = {}

    def custom_tokenizer(self, text: str) -> List[str]:
        """
        Tokenize text for BM25, tuned for legal/financial documents.

        The text is lowercased, split into word tokens, underscores inside
        tokens are stripped, and single-character tokens are dropped as noise.

        NOTE: a previous version looped over a list of legal/financial
        abbreviations replacing spaces with underscores "to avoid splitting",
        but none of those abbreviations contained a space, so the
        substitution was a provable no-op and has been removed.

        Args:
            text: Raw text to tokenize.

        Returns:
            List of normalized tokens (possibly empty).
        """
        if not text:
            return []

        # Case-insensitive matching: "LLC" and "llc" must collide.
        text = text.lower()

        # Split on whitespace and punctuation, keeping word characters only.
        tokens = re.findall(r'\b\w+\b', text)

        # Strip underscores inside tokens (behavioral parity with the
        # original tokenizer, which removed them after its substitutions).
        tokens = [token.replace('_', '') for token in tokens]

        # Filter out very short tokens (likely noise).
        return [token for token in tokens if len(token) > 1]

    def build_index(self, documents: List[Dict[str, str]], custom_tokenizer: Optional[Callable] = None):
        """
        Build BM25 index from documents and persist it to disk.

        Args:
            documents: List of dicts with 'id' and 'content' keys
            custom_tokenizer: Optional custom tokenization function
        """
        logger.info(f"Building BM25 index from {len(documents)} documents")

        # Extract content and IDs into parallel lists.
        self.documents = [doc['content'] for doc in documents]
        self.doc_ids = [doc['id'] for doc in documents]

        # Tokenize documents with the supplied or default tokenizer.
        tokenizer = custom_tokenizer or self.custom_tokenizer
        self.tokenized_docs = [tokenizer(doc) for doc in self.documents]

        # Build BM25 index over the tokenized corpus.
        self.bm25 = BM25Okapi(self.tokenized_docs)

        # Record build statistics for later inspection via get_stats().
        # (Token total computed once instead of twice, as before.)
        total_tokens = sum(len(tokens) for tokens in self.tokenized_docs)
        self.metadata = {
            'total_documents': len(self.documents),
            'total_tokens': total_tokens,
            'avg_tokens_per_doc': total_tokens / len(self.documents) if self.documents else 0
        }

        # Save to disk so deployments can load without rebuilding.
        self._save_index()

        logger.info(f"✅ BM25 index built and saved: {self.metadata}")

    def _save_index(self):
        """Save index (BM25 object, corpus, ids, metadata) to a pickle file."""
        self.index_path.parent.mkdir(parents=True, exist_ok=True)

        index_data = {
            'bm25': self.bm25,
            'documents': self.documents,
            'doc_ids': self.doc_ids,
            'tokenized_docs': self.tokenized_docs,
            'metadata': self.metadata
        }

        with open(self.index_path, 'wb') as f:
            pickle.dump(index_data, f)

        logger.info(f"💾 BM25 index saved to {self.index_path}")

    def load_index(self) -> bool:
        """
        Load index from disk.

        NOTE: uses pickle, which executes arbitrary code on load — only
        safe for trusted, locally generated index files.

        Returns:
            True if index loaded successfully, False otherwise
        """
        if not self.index_path.exists():
            logger.warning(f"BM25 index not found: {self.index_path}")
            return False

        try:
            with open(self.index_path, 'rb') as f:
                index_data = pickle.load(f)

            self.bm25 = index_data['bm25']
            self.documents = index_data['documents']
            self.doc_ids = index_data['doc_ids']
            self.tokenized_docs = index_data['tokenized_docs']
            # Older index files may predate the metadata field.
            self.metadata = index_data.get('metadata', {})

            logger.info(f"📂 BM25 index loaded: {len(self.documents)} documents")
            return True

        except Exception as e:
            logger.error(f"Failed to load BM25 index: {e}")
            return False

    def search(self, query: str, top_k: int = 10, custom_tokenizer: Optional[Callable] = None) -> List[Dict]:
        """
        Search the BM25 index.

        Args:
            query: Search query
            top_k: Number of top results to return
            custom_tokenizer: Optional custom tokenization function

        Returns:
            List of search results with scores (zero-score hits excluded)
        """
        if not self.bm25:
            logger.warning("BM25 index not loaded")
            return []

        # Tokenize the query with the same tokenizer used for the corpus.
        tokenizer = custom_tokenizer or self.custom_tokenizer
        tokenized_query = tokenizer(query)

        if not tokenized_query:
            logger.warning("Query produced no tokens")
            return []

        # Get BM25 scores for every document in the corpus.
        scores = self.bm25.get_scores(tokenized_query)

        if len(scores) == 0:
            return []

        # Indices of top scores, descending (handles fewer docs than top_k).
        num_results = min(top_k, len(scores))
        top_indices = scores.argsort()[-num_results:][::-1]

        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only return relevant results
                results.append({
                    'doc_id': self.doc_ids[idx],
                    'document': self.documents[idx],
                    'score': float(scores[idx]),
                    'rank': len(results) + 1
                })

        logger.debug(f"BM25 search returned {len(results)} results for query: {query[:50]}...")
        return results

    def get_stats(self) -> Dict:
        """
        Get index statistics.

        Returns:
            Dict with path/size/load status merged with build metadata when
            available; {'status': 'index_not_found'} when the index file is
            missing on disk (even if an index is currently loaded in memory).
        """
        if not self.index_path.exists():
            return {'status': 'index_not_found'}

        stats = {
            'index_path': str(self.index_path),
            'index_exists': self.index_path.exists(),
            'is_loaded': self.bm25 is not None,
            'index_size_mb': self.index_path.stat().st_size / (1024 * 1024) if self.index_path.exists() else 0
        }

        if self.metadata:
            stats.update(self.metadata)

        return stats
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def build_sparse_index_for_store(store_name: str, documents: List[Dict[str, str]],
                                 index_dir: str = "data/search_indexes") -> BM25Index:
    """
    Build and persist a BM25 sparse index for one document store.

    Args:
        store_name: Name of the document store (e.g. 'summit-digital-solutions-inc')
        documents: List of documents with 'id' and 'content' keys
        index_dir: Directory to store the index

    Returns:
        The freshly built BM25Index instance.
    """
    index = BM25Index(f"{index_dir}/{store_name}_bm25.pkl")
    index.build_index(documents)
    return index
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def load_sparse_index_for_store(store_name: str, index_dir: str = "data/search_indexes") -> Optional[BM25Index]:
    """
    Load a previously built BM25 sparse index for a document store.

    Args:
        store_name: Name of the document store
        index_dir: Directory containing the index

    Returns:
        The loaded BM25Index, or None when no usable index file exists.
    """
    index = BM25Index(f"{index_dir}/{store_name}_bm25.pkl")
    return index if index.load_index() else None
|
app/core/stage_manager.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Stage-based Build System for FAISS Index Generation
|
| 4 |
+
|
| 5 |
+
This module provides a stage-based build system that allows for incremental
|
| 6 |
+
builds, dependency management, and smart skipping of completed stages.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Dict, List, Any, Optional, Set
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
import glob
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
# Stage definitions with dependencies and outputs
|
| 20 |
+
# Build-stage registry: each stage declares its dependencies, the files it
# is expected to produce (glob patterns, relative to the FAISS directory),
# and a rough duration estimate for operator feedback. Order matters: it is
# the default pipeline order used when no target stages are given.
STAGES = {
    'scan': {
        'name': 'Document Scanning',
        'description': 'Scan and catalog all documents',
        'dependencies': [],
        'outputs': ['.scan_cache.json'],
        'estimated_duration': '30s',
    },
    'extract': {
        'name': 'Text Extraction',
        'description': 'Extract text from PDFs and documents',
        'dependencies': ['scan'],
        'outputs': ['.extraction_cache.json'],
        'estimated_duration': '5-10m',
    },
    'classify': {
        'name': 'Document Classification',
        'description': 'Classify document types using AI',
        'dependencies': ['extract'],
        'outputs': ['*_document_types.json'],
        'estimated_duration': '3-5m',
    },
    'chunk': {
        'name': 'Text Chunking',
        'description': 'Split documents into semantic chunks',
        'dependencies': ['extract'],
        'outputs': ['.chunking_cache.json'],
        'estimated_duration': '2-3m',
    },
    'embed': {
        'name': 'Vector Embeddings',
        'description': 'Generate embeddings for all chunks',
        'dependencies': ['chunk'],
        'outputs': ['*.pkl'],
        'estimated_duration': '5-8m',
    },
    'index': {
        'name': 'FAISS Indexing',
        'description': 'Build and save FAISS vector indices',
        'dependencies': ['embed'],
        'outputs': ['*.faiss'],
        'estimated_duration': '1-2m',
    },
    'sparse': {
        'name': 'BM25 Sparse Indexing',
        'description': 'Build BM25 sparse indices for hybrid search',
        'dependencies': ['extract'],
        'outputs': ['*_bm25.pkl'],
        'estimated_duration': '2-3m',
    },
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class StageTracker:
    """
    Tracks the state and completion status of build stages.

    State is persisted as JSON in ``<faiss_dir>/.build_state.json`` so that
    incremental builds can skip stages whose outputs are already present.
    """

    def __init__(self, faiss_dir: Path):
        self.faiss_dir = faiss_dir
        self.state_file = faiss_dir / '.build_state.json'
        self.state = self._load_state()

    def _load_state(self) -> Dict[str, Any]:
        """Load current build state from disk, starting fresh on corruption."""
        if self.state_file.exists():
            try:
                return json.loads(self.state_file.read_text())
            except json.JSONDecodeError as e:
                logger.warning(f"Corrupted state file, starting fresh: {e}")
                return self._create_initial_state()
        else:
            return self._create_initial_state()

    def _create_initial_state(self) -> Dict[str, Any]:
        """Create initial (empty) state structure."""
        return {
            'stages': {},
            'last_build': None,
            'version': '1.0',
            'total_builds': 0
        }

    def _save_state(self):
        """Save current state to disk as pretty-printed JSON."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(json.dumps(self.state, indent=2))

    def is_stage_complete(self, stage_name: str) -> bool:
        """
        Check if a stage completed successfully and all its outputs exist.

        A stage explicitly marked as failed is never considered complete,
        even if stale output files from a previous successful run are
        still on disk (this check was missing before and could let a
        failed stage be skipped on the next build).
        """
        stage_info = self.state['stages'].get(stage_name)
        if stage_info is None:
            return False

        # Explicit failure always wins over stale outputs on disk.
        if stage_info.get('status') == 'failed':
            return False

        # Every declared output glob must match at least one file.
        stage_config = STAGES[stage_name]
        for output_pattern in stage_config['outputs']:
            pattern_path = self.faiss_dir / output_pattern
            if not glob.glob(str(pattern_path)):
                logger.debug(f"Missing output: {pattern_path}")
                return False

        return True

    def mark_stage_complete(self, stage_name: str, metadata: dict = None):
        """Mark stage as completed with optional metadata."""
        self.state['stages'][stage_name] = {
            'completed_at': datetime.now().isoformat(),
            'metadata': metadata or {}
        }
        self._save_state()

    def mark_stage_failed(self, stage_name: str, error: str):
        """Mark stage as failed, recording the error message."""
        self.state['stages'][stage_name] = {
            'failed_at': datetime.now().isoformat(),
            'error': error,
            'status': 'failed'
        }
        self._save_state()

    def should_skip_stage(self, stage_name: str, force_clean: bool) -> bool:
        """Determine if a stage should be skipped (never when force_clean)."""
        if force_clean:
            return False
        return self.is_stage_complete(stage_name)

    def get_stage_status(self, stage_name: str) -> Dict[str, Any]:
        """Get detailed status of a stage ('not_started' if never run)."""
        if stage_name not in self.state['stages']:
            return {'status': 'not_started'}

        stage_info = self.state['stages'][stage_name]
        is_complete = self.is_stage_complete(stage_name)

        return {
            'status': 'completed' if is_complete else 'incomplete',
            'completed_at': stage_info.get('completed_at'),
            'metadata': stage_info.get('metadata', {}),
            'error': stage_info.get('error'),
            'is_complete': is_complete
        }

    def get_build_summary(self) -> Dict[str, Any]:
        """Get a summary of the current build state across all stages."""
        completed_stages = []
        incomplete_stages = []
        failed_stages = []

        for stage_name in STAGES.keys():
            status = self.get_stage_status(stage_name)
            if status['status'] == 'completed':
                completed_stages.append(stage_name)
            elif status.get('error'):
                failed_stages.append(stage_name)
            else:
                incomplete_stages.append(stage_name)

        return {
            'completed_stages': completed_stages,
            'incomplete_stages': incomplete_stages,
            'failed_stages': failed_stages,
            'last_build': self.state.get('last_build'),
            'total_builds': self.state.get('total_builds', 0)
        }

    def reset_stage(self, stage_name: str):
        """Reset a specific stage to not started."""
        if stage_name in self.state['stages']:
            del self.state['stages'][stage_name]
            self._save_state()

    def reset_all_stages(self):
        """Reset all stages to not started."""
        self.state['stages'] = {}
        self._save_state()
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
class StageManager:
    """Manages execution of build stages with dependency resolution."""

    def __init__(self, faiss_dir: Path):
        self.faiss_dir = faiss_dir
        self.tracker = StageTracker(faiss_dir)

    def resolve_dependencies(self, target_stages: List[str], completed_stages: Set[str]) -> List[str]:
        """
        Return the ordered list of stages that must run, dependencies first.

        Raises:
            ValueError: if a target names an unknown stage.
        """
        ordered = []

        for target in target_stages:
            if target not in STAGES:
                raise ValueError(f"Unknown stage: {target}")

            # Recurse into unmet dependencies before scheduling the target.
            for dependency in STAGES[target]['dependencies']:
                if dependency not in completed_stages:
                    ordered.extend(self.resolve_dependencies([dependency], completed_stages))

            if target not in completed_stages:
                ordered.append(target)

        # De-duplicate while keeping first-seen order.
        seen = set()
        unique = []
        for stage in ordered:
            if stage not in seen:
                seen.add(stage)
                unique.append(stage)

        return unique

    def get_completed_stages(self, force_clean: bool = False) -> Set[str]:
        """Get the set of completed stages (empty when a clean build is forced)."""
        if force_clean:
            return set()
        return {name for name in STAGES.keys() if self.tracker.is_stage_complete(name)}

    def execute_stage(self, stage_name: str, **kwargs) -> Dict[str, Any]:
        """Execute a specific stage - to be implemented by subclasses."""
        raise NotImplementedError(f"Stage execution not implemented for: {stage_name}")

    def run_build_pipeline(self, target_stages: Optional[List[str]] = None,
                           force_clean: bool = False) -> Dict[str, Any]:
        """
        Run the build pipeline with dependency resolution.

        Executes (or skips) each required stage in order; stops at the
        first failure so dependent stages never run on broken inputs.
        """
        # Default to all stages if none specified.
        if target_stages is None:
            target_stages = list(STAGES.keys())

        done = self.get_completed_stages(force_clean)
        pending = self.resolve_dependencies(target_stages, done)
        logger.info(f"Build pipeline: {len(pending)} stages to execute")

        outcomes = []
        for stage_name in pending:
            stage_config = STAGES[stage_name]

            if self.tracker.should_skip_stage(stage_name, force_clean):
                logger.info(f"⏭️ Skipping stage '{stage_name}' (already complete)")
                outcomes.append({
                    'stage': stage_name,
                    'status': 'skipped',
                    'reason': 'already_complete'
                })
                continue

            logger.info(f"🚀 Executing stage '{stage_name}': {stage_config['description']}")
            started = time.time()

            try:
                stage_result = self.execute_stage(stage_name, force_clean=force_clean)
            except Exception as e:
                elapsed = time.time() - started
                error_msg = f"Stage '{stage_name}' failed after {elapsed:.1f}s: {e}"
                logger.error(f"❌ {error_msg}")

                self.tracker.mark_stage_failed(stage_name, str(e))
                outcomes.append({
                    'stage': stage_name,
                    'status': 'failed',
                    'execution_time': elapsed,
                    'error': str(e)
                })
                # Don't continue with dependent stages on failure.
                break
            else:
                elapsed = time.time() - started
                self.tracker.mark_stage_complete(stage_name, {
                    'execution_time': elapsed,
                    'result': stage_result
                })

                logger.info(f"✅ Stage '{stage_name}' completed in {elapsed:.1f}s")
                outcomes.append({
                    'stage': stage_name,
                    'status': 'completed',
                    'execution_time': elapsed,
                    'result': stage_result
                })

        # Update build metadata regardless of outcome.
        self.tracker.state['last_build'] = datetime.now().isoformat()
        self.tracker.state['total_builds'] = self.tracker.state.get('total_builds', 0) + 1
        self.tracker._save_state()

        return {
            'success': all(o['status'] in ['completed', 'skipped'] for o in outcomes),
            'stages_executed': sum(1 for o in outcomes if o['status'] == 'completed'),
            'stages_skipped': sum(1 for o in outcomes if o['status'] == 'skipped'),
            'stages_failed': sum(1 for o in outcomes if o['status'] == 'failed'),
            'results': outcomes,
            'total_time': sum(o.get('execution_time', 0) for o in outcomes)
        }
|
app/core/utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Utility Functions Module
|
| 4 |
+
|
| 5 |
+
Collection of utility functions used throughout the application.
|
| 6 |
+
This module contains helper functions for file operations, formatting,
|
| 7 |
+
and document processing utilities.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import List, Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_mime_type(file_path: Path) -> str:
    """Return the MIME type for a file based on its extension.

    Args:
        file_path: Path whose suffix (case-insensitive) selects the type.

    Returns:
        A known MIME type string, or ``'application/octet-stream'`` for
        unrecognized extensions.
    """
    # Explicit mapping keeps the supported formats obvious; anything else
    # falls back to the generic binary type.
    mime_map = {
        '.pdf': 'application/pdf',
        '.doc': 'application/msword',
        # Fixed: .docx is OOXML and has its own registered MIME type
        # (previously both .doc and .docx returned 'application/msword').
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.txt': 'text/plain',
        '.md': 'text/markdown',
    }
    return mime_map.get(file_path.suffix.lower(), 'application/octet-stream')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def format_document_title(doc_name: str) -> str:
    """Turn a raw document filename into a human-readable title.

    Strips the final extension (when one is present), converts underscores
    and hyphens to spaces, and applies title casing.
    """
    base = doc_name.rsplit('.', 1)[0] if '.' in doc_name else doc_name
    return base.replace('_', ' ').replace('-', ' ').title()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
    """Count supported documents in a directory recursively.

    Args:
        directory: Root directory to scan (all subdirectories included).
        supported_extensions: Extensions to count, each with a leading dot.
            Matching is case-insensitive. Defaults to the application's
            supported document types.

    Returns:
        Number of matching regular files under ``directory``.
    """
    if supported_extensions is None:
        supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']

    # Normalize to a lowercase set: O(1) membership per file, and
    # caller-supplied extensions like '.PDF' now match (previously they
    # silently matched nothing because suffixes are lowercased below).
    allowed = {ext.lower() for ext in supported_extensions}

    return sum(1 for f in directory.rglob('*')
               if f.is_file() and f.suffix.lower() in allowed)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def create_document_processor(store_name: Optional[str] = None) -> 'DocumentProcessor':
    """Construct a ready-to-use DocumentProcessor.

    Wraps the common instantiation pattern so callers do not have to import
    DocumentProcessor themselves.

    Args:
        store_name: Optional name for the FAISS store (uses config default if None)

    Returns:
        Initialized DocumentProcessor instance
    """
    # Local import — presumably kept function-scoped to avoid a circular
    # dependency between utils and document_processor; confirm before moving.
    from app.core.document_processor import DocumentProcessor

    return DocumentProcessor(store_name=store_name)
|
app/handlers/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Handlers Package
|
| 3 |
+
|
| 4 |
+
Contains business logic handlers that coordinate between UI and services.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .document_handler import DocumentHandler
|
| 8 |
+
from .ai_handler import AIHandler
|
| 9 |
+
from .export_handler import ExportHandler
|
| 10 |
+
|
| 11 |
+
__all__ = ['DocumentHandler', 'AIHandler', 'ExportHandler']
|
app/handlers/ai_handler.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AI Handler
|
| 4 |
+
|
| 5 |
+
Handles AI operations and coordinates between UI and AI service.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Optional, List
|
| 9 |
+
|
| 10 |
+
from app.ui.session_manager import SessionManager
|
| 11 |
+
from app.services.ai_service import AIService, create_ai_service
|
| 12 |
+
from app.core.exceptions import AIError, ConfigError, create_ai_error
|
| 13 |
+
from app.ui.error_handler import handle_processing_errors
|
| 14 |
+
from app.core.logging import logger
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AIHandler:
    """
    AI handler that manages AI operations using the AI service.

    Provides a clean interface between UI and AI service.
    """

    def __init__(self, session: SessionManager):
        """Initialize handler with session manager"""
        # Shared session state; the created AI service is also published to
        # session.agent so other components can reach it.
        self.session = session
        self._ai_service: Optional[AIService] = None

    @handle_processing_errors("AI service setup", "Please check your API key and try again")
    def setup_agent(self, api_key: str, model_choice: str) -> bool:
        """
        Setup AI service with given credentials.

        Args:
            api_key: Anthropic API key
            model_choice: Claude model to use

        Returns:
            True if AI service was successfully initialized

        Raises:
            AIError: If AI service setup fails
            ConfigError: If API key or model is invalid
        """
        # Get appropriate max_tokens for the model
        from app.core.config import get_app_config
        config = get_app_config()

        # Adjust max_tokens based on model limitations
        max_tokens = config.model['max_tokens']
        original_max_tokens = max_tokens

        # NOTE(review): both branches currently clamp to the same 8192 limit,
        # so the haiku/sonnet distinction has no effect today — kept separate
        # for when the limits diverge.
        if 'haiku' in model_choice.lower():
            # Claude Haiku has a maximum of 8192 output tokens
            max_tokens = min(max_tokens, 8192)
        elif 'sonnet' in model_choice.lower():
            # Claude Sonnet models can handle higher token counts
            max_tokens = min(max_tokens, 8192)  # Conservative limit for reliability

        if max_tokens != original_max_tokens:
            logger.info(f"Adjusted max_tokens for {model_choice}: {original_max_tokens} -> {max_tokens}")

        logger.info(f"Initializing AI service: model={model_choice}, max_tokens={max_tokens}, temperature={config.model['temperature']}")

        # Create AI service with proper token limits
        self._ai_service = create_ai_service(
            api_key=api_key,
            model=model_choice,
            temperature=config.model['temperature'],
            max_tokens=max_tokens
        )

        # Check if service was created successfully
        if self._ai_service is None:
            raise create_ai_error(
                "AI service creation failed",
                recovery_hint="Please check your API key and try again"
            )

        # Test the service
        if self._ai_service.is_available:
            # Store the AI service in the session for other components to access
            self.session.agent = self._ai_service
            return True
        else:
            raise create_ai_error(
                "AI service initialization failed",
                recovery_hint="Please check your API key and network connection"
            )

    def is_agent_available(self) -> bool:
        """
        Check if AI service is available and ready.

        Returns:
            True if AI service is available
        """
        # Check local AI service first
        if self._ai_service is not None and self._ai_service.is_available:
            return True

        # Check session for existing agent (e.g. set up by another handler
        # instance sharing the same session)
        if self.session.agent is not None:
            # Update local reference if session has an agent
            self._ai_service = self.session.agent
            return self._ai_service.is_available

        return False


    @handle_processing_errors("Report generation", "Please check your documents and try again")
    def generate_report(self, report_type: str, **kwargs) -> Optional[str]:
        """
        Generate a report using the AI service.

        Args:
            report_type: Type of report ('overview', 'strategic', 'checklist', 'questions')
            **kwargs: Additional arguments for report generation
                (``documents``, ``strategy_text``, ``checklist_results``)

        Returns:
            Generated report content or None if failed

        Raises:
            AIError: If report generation fails
        """
        if not self.is_agent_available():
            raise create_ai_error(
                "AI service not available",
                recovery_hint="Please configure your API key in the sidebar"
            )

        documents = kwargs.get('documents', {})
        strategy_text = kwargs.get('strategy_text')
        checklist_results = kwargs.get('checklist_results')

        return self._ai_service.analyze_documents(
            documents=documents,
            analysis_type=report_type,
            strategy_text=strategy_text,
            checklist_results=checklist_results
        )


    @handle_processing_errors("Question answering", "Please try rephrasing your question")
    def answer_question(self, question: str, context_docs: List[str]) -> str:
        """
        Answer a specific question using AI.

        Args:
            question: The question to answer
            context_docs: List of relevant document excerpts

        Returns:
            AI-generated answer

        Raises:
            AIError: If question answering fails
        """
        if not self.is_agent_available():
            raise create_ai_error(
                "AI service not available",
                recovery_hint="Please configure your API key in the sidebar"
            )

        return self._ai_service.answer_question(question, context_docs)

    @property
    def llm(self):
        """Get the underlying LLM instance"""
        # NOTE(review): unlike is_agent_available(), this does not check
        # is_available — callers may receive an llm from a degraded service.
        # Check local AI service first
        if self._ai_service is not None:
            return self._ai_service.llm

        # Check session for existing agent
        if self.session.agent is not None:
            # Update local reference if session has an agent
            self._ai_service = self.session.agent
            return self._ai_service.llm

        return None
|
app/handlers/document_handler.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Document Handler
|
| 4 |
+
|
| 5 |
+
Handles document processing operations and coordinates with the document processor.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List, Any
|
| 10 |
+
|
| 11 |
+
from app.ui.session_manager import SessionManager
|
| 12 |
+
from app.core.exceptions import ProcessingError
|
| 13 |
+
from app.ui.error_handler import ErrorHandler, handle_processing_errors
|
| 14 |
+
from app.core.exceptions import DocumentProcessingError, FileOperationError, create_processing_error
|
| 15 |
+
from app.core.logging import logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DocumentHandler:
    """
    Document handler that manages document processing operations.
    """

    def __init__(self, session: SessionManager):
        """Initialize handler with session manager"""
        self.session = session

    @handle_processing_errors("Data room processing", "Please check that the data room exists and contains documents")
    def process_data_room_fast(self, data_room_path: str):
        """
        Fast data room processing using pre-built FAISS indices.

        Scans document metadata (no content loading), pulls chunks from the
        pre-built FAISS store, stores everything in the session, and preloads
        checklist/document-type embeddings.

        Args:
            data_room_path: Path to the data room directory

        Returns:
            Tuple of (documents_count, chunks_count) or None on error

        Raises:
            ProcessingError: If no index exists, the path is invalid, or no
                supported documents are found.
            RuntimeError: Re-raised when embedding preloads fail (hard failure).
        """
        # Extract company name from path; the index store is keyed by the
        # lowercase directory name.
        company_name = Path(data_room_path).name.lower()

        # Initialize document processor with loaded FAISS store
        from app.core.utils import create_document_processor
        document_processor = create_document_processor(store_name=company_name)

        if not document_processor.vector_store:
            raise create_processing_error(
                f"No pre-built FAISS index found for '{company_name}'",
                recovery_hint="Please run scripts/build_indexes.py first to create the index"
            )

        # Quick document metadata scan
        documents_dict = self._quick_document_scan(data_room_path)

        # Get chunks from FAISS metadata
        chunks = self._extract_chunks_from_faiss(document_processor)

        # Store in session
        self.session.documents = documents_dict
        self.session.chunks = chunks
        self.session.embeddings = document_processor.embeddings
        self.session.vdr_store = company_name

        # Preload checklist embeddings into memory for fast search
        from app.core.search import preload_checklist_embeddings
        logger.info("Attempting to preload checklist embeddings...")
        try:
            preloaded_count = preload_checklist_embeddings()
            logger.info(f"✅ Successfully preloaded {preloaded_count} checklist embeddings for fast searching")
        except RuntimeError as e:
            logger.error(f"❌ Failed to preload checklist embeddings: {e}")
            logger.error("This will cause checklist matching to fail - embeddings must be available for search")
            # Don't fail the entire data room processing, but make it very clear this is a problem
            raise  # Re-raise to make this a hard failure

        # Preload document type embeddings into memory for fast search
        from app.core.search import preload_document_type_embeddings
        logger.info("Attempting to preload document type embeddings...")
        try:
            type_embeddings = preload_document_type_embeddings(company_name)
            # Store in session for use during search
            self.session.document_type_embeddings = type_embeddings
            logger.info(f"✅ Successfully preloaded {len(type_embeddings)} document type embeddings for fast searching")
            logger.info(f"Session ID: {id(self.session)}, Embeddings stored: {bool(self.session.document_type_embeddings)}")
        except RuntimeError as e:
            logger.error(f"❌ Failed to preload document type embeddings: {e}")
            logger.error("Checklist processing will fail - embeddings are required")
            raise  # Make this a hard failure since embeddings are now required

        # Clear existing analysis
        # NOTE(review): reset() runs AFTER documents/chunks/embeddings are
        # stored above — verify it only clears analysis results and does not
        # wipe the state just assigned.
        self.session.reset()

        logger.info(f"Successfully processed {len(documents_dict)} documents and {len(chunks)} chunks")
        return len(documents_dict), len(chunks)

    def _quick_document_scan(self, data_room_path: str) -> Dict[str, Any]:
        """Quick scan of document files without loading content.

        Returns a dict keyed by absolute file path; each value carries name,
        relative path, a size placeholder instead of content, and metadata.
        """
        documents_dict = {}
        data_room_path_obj = Path(data_room_path)

        # Validate data room path exists
        if not data_room_path_obj.exists():
            raise create_processing_error(
                f"Data room path does not exist: {data_room_path}",
                recovery_hint="Please select a valid data room directory"
            )

        # Quick file system scan for supported extensions
        from app.core import get_config
        config = get_config()
        supported_extensions = config.get_supported_extensions()

        for ext in supported_extensions:
            for file_path in data_room_path_obj.rglob(f"*{ext}"):
                if file_path.is_file():
                    try:
                        rel_path = file_path.relative_to(data_room_path_obj)
                        documents_dict[str(file_path)] = {
                            'name': file_path.name,
                            'path': str(rel_path),
                            # Placeholder only — real content lives in FAISS.
                            'content': f"[Indexed - {file_path.stat().st_size:,} bytes]",
                            'metadata': {
                                'source': str(file_path),
                                'name': file_path.name,
                                'path': str(rel_path)
                            }
                        }
                    except ValueError:
                        # Skip files outside data room path
                        continue

        if not documents_dict:
            raise create_processing_error(
                f"No supported documents found in {data_room_path}",
                recovery_hint="Please ensure the data room contains PDF, DOCX, or text files"
            )

        return documents_dict

    def _extract_chunks_from_faiss(self, document_processor) -> List[Dict]:
        """Extract chunk information from loaded FAISS store.

        Each chunk's text is truncated to 500 chars for display; on failure a
        single placeholder chunk is returned instead of raising.
        """
        chunks = []

        if not document_processor.vector_store:
            logger.warning("No vector store available for chunk extraction")
            return chunks

        try:
            # Access the docstore to get document metadata
            # NOTE(review): relies on the private `_dict` attribute of the
            # LangChain/FAISS docstore — may break across library upgrades.
            docstore = document_processor.vector_store.docstore

            for doc_id in docstore._dict.keys():
                doc = docstore._dict[doc_id]
                chunk_text = doc.page_content
                if len(chunk_text) > 500:
                    chunk_text = chunk_text[:500] + "..."

                chunk_dict = {
                    'text': chunk_text,
                    'source': doc.metadata.get('name', ''),
                    'path': doc.metadata.get('path', ''),
                    'full_path': doc.metadata.get('source', ''),
                    'metadata': doc.metadata
                }
                chunks.append(chunk_dict)

        except (DocumentProcessingError, FileOperationError) as e:
            ErrorHandler.handle_error(
                e,
                "Failed to extract chunks from FAISS store",
                recovery_hint="The FAISS index may be corrupted"
            )
            # Fallback: create minimal chunks
            chunks = [{
                'text': '[Content available in search]',
                'source': 'indexed_content',
                'path': '',
                'full_path': '',
                'metadata': {}
            }]

        return chunks

    def get_document_processor(self, store_name: str = None):
        """
        Get a configured document processor.

        Args:
            store_name: Optional store name for the processor

        Returns:
            Configured DocumentProcessor instance
        """
        from app.core.utils import create_document_processor
        return create_document_processor(store_name=store_name)

    def validate_data_room(self, data_room_path: str) -> bool:
        """
        Validate that a data room path exists and contains documents.

        Args:
            data_room_path: Path to validate

        Returns:
            True if valid, False otherwise
        """
        path_obj = Path(data_room_path)
        if not path_obj.exists():
            return False

        return self._has_supported_files(path_obj)

    def _has_supported_files(self, path_obj: Path) -> bool:
        """
        Check if path contains files with supported extensions.

        Args:
            path_obj: Path object to check

        Returns:
            True if supported files are found
        """
        from app.core import get_config
        config = get_config()
        supported_extensions = config.get_supported_extensions()

        for ext in supported_extensions:
            if list(path_obj.rglob(f"*{ext}")):
                return True

        return False
|
app/handlers/export_handler.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Export Handler
|
| 4 |
+
|
| 5 |
+
Handles report export operations.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from app.ui.session_manager import SessionManager
|
| 11 |
+
from app.core.exceptions import ProcessingError
|
| 12 |
+
from app.ui.error_handler import handle_ui_errors
|
| 13 |
+
from app.core.exceptions import create_processing_error
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ExportHandler:
    """Coordinates the export of analysis results as markdown reports."""

    def __init__(self, session: SessionManager):
        """Keep a reference to the shared session state."""
        self.session = session

    @handle_ui_errors("Export overview report", "Please ensure overview analysis is complete")
    def export_overview_report(self) -> tuple[str, str]:
        """Build the company-overview markdown report.

        Returns:
            Tuple of (file_name, content).
        """
        overview = self.session.overview_summary
        if not overview:
            raise create_processing_error(
                "No overview analysis available for export",
                recovery_hint="Please complete the overview analysis first"
            )

        company = self._get_company_name()
        return f"company_overview_{company}.md", f"# Company Overview\n\n{overview}"

    @handle_ui_errors("Export strategic report", "Please ensure strategic analysis is complete")
    def export_strategic_report(self) -> tuple[str, str]:
        """Build the strategic-analysis markdown report.

        Includes the overview section when one is available.

        Returns:
            Tuple of (file_name, content).
        """
        strategic = self.session.strategic_summary
        if not strategic:
            raise create_processing_error(
                "No strategic analysis available for export",
                recovery_hint="Please complete the strategic analysis first"
            )

        sections = ["# Due Diligence Report\n\n"]
        overview = self.session.overview_summary
        if overview:
            sections.append(f"## Company Overview\n\n{overview}\n\n")
        sections.append(f"## Strategic Analysis\n\n{strategic}")

        company = self._get_company_name()
        return f"dd_report_{company}.md", "".join(sections)

    @handle_ui_errors("Export combined report", "Please ensure analysis is complete")
    def export_combined_report(self) -> tuple[str, str]:
        """Build the full due-diligence report.

        Combines overview, strategic analysis, checklist results, and
        answered due-diligence questions — whichever are available.

        Returns:
            Tuple of (file_name, content).
        """
        overview = self.session.overview_summary
        strategic = self.session.strategic_summary
        if not (overview or strategic):
            raise create_processing_error(
                "No analysis data available for export",
                recovery_hint="Please complete overview or strategic analysis first"
            )

        company = self._get_company_name()
        sections = [f"# Complete Due Diligence Report - {company.title()}\n\n"]

        if overview:
            sections.append(f"## Company Overview\n\n{overview}\n\n")
        if strategic:
            sections.append(f"## Strategic Analysis\n\n{strategic}\n\n")

        # Checklist section, when results exist
        if self.session.checklist_results:
            sections.append("## Checklist Analysis\n\n")
            for category, items in self.session.checklist_results.items():
                sections.append(f"### {category}\n\n")
                if isinstance(items, list):
                    for item in items:
                        label = item.get('text', str(item)) if isinstance(item, dict) else str(item)
                        sections.append(f"- {label}\n")
                sections.append("\n")

        # Answered questions section, when results exist
        if self.session.question_answers:
            sections.append("## Due Diligence Questions\n\n")
            for question, answer in self.session.question_answers.items():
                if isinstance(answer, dict) and answer.get('has_answer'):
                    sections.append(f"### {question}\n\n{answer.get('answer', '')}\n\n")

        return f"complete_dd_report_{company}.md", "".join(sections)

    @handle_ui_errors("Export checklist report", "Please ensure checklist analysis is complete")
    def export_checklist_report(self) -> tuple[str, str]:
        """Build the checklist-analysis markdown report.

        Returns:
            Tuple of (file_name, content).
        """
        results = self.session.checklist_results
        if not results:
            raise create_processing_error(
                "No checklist results available for export",
                recovery_hint="Please complete the checklist analysis first"
            )

        company = self._get_company_name()
        sections = [f"# Checklist Analysis Report - {company.title()}\n\n"]

        for category, items in results.items():
            sections.append(f"## {category}\n\n")
            if isinstance(items, list):
                for item in items:
                    label = item.get('text', str(item)) if isinstance(item, dict) else str(item)
                    sections.append(f"- {label}\n")
            sections.append("\n")

        return f"checklist_analysis_{company}.md", "".join(sections)

    def _get_company_name(self) -> str:
        """Derive the company name from the first document's parent folder."""
        documents = self.session.documents
        if not documents:
            return 'export'
        first_doc = next(iter(documents))
        return Path(first_doc).parent.name
|
app/main.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Main Application Entry Point
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# Standard library imports
|
| 7 |
+
import os
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
# Third-party imports
|
| 11 |
+
import streamlit as st
|
| 12 |
+
|
| 13 |
+
# Local imports
|
| 14 |
+
from app.core.config import init_app_config
|
| 15 |
+
from app.core.logging import configure_langchain_logging
|
| 16 |
+
from app.handlers.ai_handler import AIHandler
|
| 17 |
+
from app.handlers.document_handler import DocumentHandler
|
| 18 |
+
from app.handlers.export_handler import ExportHandler
|
| 19 |
+
from app.ui.session_manager import SessionManager
|
| 20 |
+
from app.ui.sidebar import Sidebar
|
| 21 |
+
from app.ui.tabs.checklist_tab import ChecklistTab
|
| 22 |
+
from app.ui.tabs.graph_tab import GraphTab
|
| 23 |
+
from app.ui.tabs.overview_tab import OverviewTab
|
| 24 |
+
from app.ui.tabs.qa_tab import QATab
|
| 25 |
+
from app.ui.tabs.questions_tab import QuestionsTab
|
| 26 |
+
from app.ui.tabs.strategic_tab import StrategicTab
|
| 27 |
+
|
| 28 |
+
# Enable tokenizers parallelism for better performance.
# setdefault respects a value already supplied by the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")

# Initialize for Streamlit Cloud deployment (must be done before other imports)
try:
    from scripts.streamlit_cloud_config import initialize_for_streamlit_cloud
    initialize_for_streamlit_cloud()
except ImportError:
    # Local development - skip cloud initialization
    pass

# Only suppress specific known non-critical warnings
# (message patterns are regexes matched against the warning text)
warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class App:
    """Main application class that orchestrates all components.

    Wires together configuration, session state, domain handlers, the
    sidebar, and the analysis tabs, then drives the Streamlit page layout.
    """

    # Tab registry: (tab key, display label) in render order. Keeping keys
    # and labels together prevents them drifting out of sync in run().
    _TAB_ORDER = [
        ('overview', "🏢 Company Overview"),
        ('strategic', "🎯 Strategic Analysis"),
        ('checklist', "📊 Checklist Matching"),
        ('questions', "❓ Due Diligence Questions"),
        ('qa', "💬 Q&A with Citations"),
        ('graph', "🧠 Knowledge Graph"),
    ]

    def __init__(self):
        """Initialize configuration, page settings, handlers, and UI components."""
        # Initialize configuration first so page settings are available.
        self.config = init_app_config()

        # FIX: st.set_page_config must be the FIRST Streamlit command executed
        # on a script run; calling it after any component has issued a
        # Streamlit call raises StreamlitAPIException. Configure the page
        # before constructing sidebar/tab components, which may touch
        # Streamlit session state during construction.
        st.set_page_config(
            page_title=self.config.ui['page_title'],
            page_icon=self.config.ui['page_icon'],
            layout=self.config.ui['layout']
        )

        # Session state wrapper shared by all handlers and tabs.
        self.session = SessionManager()

        # Domain handlers.
        self.document_handler = DocumentHandler(self.session)
        self.ai_handler = AIHandler(self.session)
        self.export_handler = ExportHandler(self.session)

        # UI components.
        self.sidebar = Sidebar(self.session, self.config)
        self.tabs = {
            'overview': OverviewTab(self.session, self.config, self.ai_handler, self.export_handler),
            'strategic': StrategicTab(self.session, self.config, self.ai_handler, self.export_handler),
            'checklist': ChecklistTab(self.session, self.config, self.ai_handler),
            'questions': QuestionsTab(self.session, self.config, self.ai_handler),
            'qa': QATab(self.session, self.config, self.ai_handler),
            'graph': GraphTab(self.session, self.config, self.ai_handler, self.export_handler)
        }

    def run(self):
        """Render the page: header, sidebar, analysis tabs, and processing trigger."""
        # Header.
        st.title("🤖 AI Due Diligence")
        st.markdown("**Intelligent M&A Analysis:** Strategic assessment, automated document review, and AI-powered insights")

        # Sidebar returns the selected data room and whether processing was requested.
        data_room_path, process_button = self.sidebar.render()

        # Remember the selected data room for the rest of the session.
        if data_room_path:
            self.session.data_room_path = data_room_path

        # Render the analysis tabs in registry order; zip keeps each tab
        # object paired with its key so labels cannot mismatch content.
        labels = [label for _, label in self._TAB_ORDER]
        for (key, _), tab in zip(self._TAB_ORDER, st.tabs(labels)):
            with tab:
                self.tabs[key].render()

        # Kick off data-room processing only when explicitly requested.
        if process_button and data_room_path:
            with st.spinner("🚀 Processing data room..."):
                self.sidebar.process_data_room(data_room_path)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def main():
    """Application entry point: configure logging, build the App, run it.

    Any startup failure is routed through the shared ErrorHandler and the
    Streamlit script run is stopped.
    """
    # Quiet LangChain's verbose default logging before anything else runs.
    configure_langchain_logging(log_level="WARNING")

    try:
        App().run()
    except Exception as e:
        # Imported here (not at module level) to avoid a circular import.
        from app.ui.error_handler import ErrorHandler

        ErrorHandler.handle_error(
            e,
            "Application startup failed",
            recovery_hint="Please refresh the page and try again",
        )
        st.stop()
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()
|
app/services/ai_client.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AI Client
|
| 4 |
+
|
| 5 |
+
Handles Anthropic API client and LLM interaction logic.
|
| 6 |
+
Provides clean interface for LLM operations and connection management.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Any, List, NoReturn, Optional

from app.core.exceptions import (
    AIError,
    LLMAuthenticationError,
    LLMConnectionError,
    LLMInvalidResponseError,
    LLMQuotaExceededError,
    LLMTimeoutError,
)
from app.services.ai_config import AIConfig
|
| 14 |
+
|
| 15 |
+
# Import specific exception types for robust error handling.
#
# FIX: the previous single `from anthropic import (...)` included
# ServiceUnavailableError, which is not exported by current anthropic
# releases. One missing name made the whole import raise ImportError,
# silently setting EVERY exception class to None and disabling typed
# error detection. Resolve each name individually so one missing symbol
# does not affect the others.
try:
    import anthropic as _anthropic_module
except ImportError:
    # anthropic package not installed; all names fall back to None and
    # the string-based fallbacks in AIClient take over.
    _anthropic_module = None


def _anthropic_exc(name):
    """Return the named anthropic exception class, or None if unavailable."""
    return getattr(_anthropic_module, name, None)


APIConnectionError = _anthropic_exc("APIConnectionError")
APIError = _anthropic_exc("APIError")
APITimeoutError = _anthropic_exc("APITimeoutError")
AuthenticationError = _anthropic_exc("AuthenticationError")
BadRequestError = _anthropic_exc("BadRequestError")
ConflictError = _anthropic_exc("ConflictError")
InternalServerError = _anthropic_exc("InternalServerError")
NotFoundError = _anthropic_exc("NotFoundError")
PermissionDeniedError = _anthropic_exc("PermissionDeniedError")
RateLimitError = _anthropic_exc("RateLimitError")
UnprocessableEntityError = _anthropic_exc("UnprocessableEntityError")
ServiceUnavailableError = _anthropic_exc("ServiceUnavailableError")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class AIClient:
    """
    Anthropic API client for LLM interactions.

    Manages the connection to Anthropic's Claude models via LangChain,
    performs lazy initialization with a connectivity smoke test, and maps
    provider/library exceptions onto the application's LLM error hierarchy.
    """

    def __init__(self, config: AIConfig) -> None:
        """
        Initialize AI client with configuration.

        The underlying LLM connection is created lazily on first use, so
        constructing the client never performs network I/O.

        Args:
            config: AIConfig object containing service configuration
        """
        self.config: AIConfig = config
        self._llm: Optional[Any] = None   # ChatAnthropic instance once initialized
        self._initialized: bool = False   # True only after a successful smoke test

    def _ensure_initialized(self) -> None:
        """
        Lazily create and smoke-test the underlying LLM connection.

        Idempotent: subsequent calls return immediately once the client has
        been initialized successfully.

        Raises:
            AIError: If required libraries are missing, or the connection
                test fails or returns an unexpected response.
        """
        if self._initialized:
            return

        try:
            from langchain_anthropic import ChatAnthropic

            self._llm = ChatAnthropic(
                api_key=self.config.api_key,
                model=self.config.model,
                temperature=self.config.temperature,
                max_tokens=self.config.max_tokens
            )

            # Smoke-test the connection; this costs one small API call but
            # surfaces auth/network problems immediately rather than on the
            # first real request.
            from langchain_core.messages import HumanMessage
            test_response = self._llm.invoke([
                HumanMessage(content="Please respond with 'AI connection successful' if you can read this message.")
            ])
            if not test_response or not hasattr(test_response, 'content') or not test_response.content.strip():
                raise AIError("AI service test failed - no valid response received")

            # Loose keyword check: accept any response that echoes the
            # expected terms instead of demanding an exact match.
            response_text = test_response.content.strip().lower()
            if "successful" not in response_text and "ai" not in response_text:
                raise AIError("AI service test failed - unexpected response format")

            self._initialized = True

        except ImportError as e:
            raise AIError(
                f"Missing required AI library: {str(e)}",
                user_message="AI libraries not installed",
                recovery_hint="Please install required dependencies"
            ) from e
        except AIError:
            # FIX: the smoke-test AIErrors raised above were previously
            # caught by the broad handler below and re-wrapped with a
            # generic "initialization failed" message, losing the specific
            # diagnosis. Re-raise them unchanged.
            raise
        except Exception as e:
            self._handle_llm_error(e)

    def _handle_llm_error(self, error: Exception, include_invalid_response: bool = False) -> NoReturn:
        """
        Classify an LLM-related error and raise the matching domain exception.

        Exception-type checks are the primary classification mechanism, with
        string-based fallbacks for compatibility with library versions that
        do not export the typed exceptions. This method ALWAYS raises
        (hence the NoReturn annotation), so callers need no code after it.

        Args:
            error: The exception that occurred
            include_invalid_response: Whether to also classify invalid/malformed
                response errors (used during response generation, not init)

        Raises:
            LLMAuthenticationError, LLMTimeoutError, LLMQuotaExceededError,
            LLMConnectionError, LLMInvalidResponseError, or AIError.
        """
        if self._is_authentication_error(error):
            raise LLMAuthenticationError(
                f"AI authentication failed: {str(error)}",
                user_message="AI authentication failed",
                recovery_hint="Please check your API key"
            )
        elif self._is_timeout_error(error):
            raise LLMTimeoutError(
                f"AI service timeout: {str(error)}",
                user_message="AI service timed out",
                recovery_hint="Please try again later"
            )
        elif self._is_quota_error(error):
            raise LLMQuotaExceededError(
                f"AI quota exceeded: {str(error)}",
                user_message="AI quota exceeded",
                recovery_hint="Please check your API usage limits"
            )
        elif self._is_connection_error(error):
            raise LLMConnectionError(
                f"AI connection failed: {str(error)}",
                user_message="AI connection failed",
                recovery_hint="Please check your network connection"
            )
        elif include_invalid_response and self._is_invalid_response_error(error):
            raise LLMInvalidResponseError(
                f"AI returned invalid response: {str(error)}",
                user_message="AI returned invalid response",
                recovery_hint="Please try again"
            )

        # Fallback messages depend on the calling context: response
        # generation vs. client initialization.
        if include_invalid_response:
            raise AIError(
                f"Response generation failed: {str(error)}",
                user_message="Failed to generate AI response",
                recovery_hint="Please try again or check your API key"
            )
        else:
            raise AIError(
                f"Failed to initialize AI client: {str(error)}",
                user_message="AI client initialization failed",
                recovery_hint="Please check your API key and network connection"
            )

    def _is_authentication_error(self, error: Exception) -> bool:
        """Check if error is an authentication-related error."""
        # Primary: typed checks (guards handle the names being None when
        # the anthropic package is unavailable).
        if AuthenticationError and isinstance(error, AuthenticationError):
            return True
        if PermissionDeniedError and isinstance(error, PermissionDeniedError):
            return True

        # Fallback: string-based detection for compatibility.
        error_msg = str(error).lower()
        return "authentication" in error_msg or "api key" in error_msg or "unauthorized" in error_msg

    def _is_timeout_error(self, error: Exception) -> bool:
        """Check if error is a timeout-related error."""
        if APITimeoutError and isinstance(error, APITimeoutError):
            return True

        error_msg = str(error).lower()
        return "timeout" in error_msg or "timed out" in error_msg

    def _is_quota_error(self, error: Exception) -> bool:
        """Check if error is a quota/rate-limit related error."""
        if RateLimitError and isinstance(error, RateLimitError):
            return True

        error_msg = str(error).lower()
        return "quota" in error_msg or "rate limit" in error_msg or "limit exceeded" in error_msg

    def _is_connection_error(self, error: Exception) -> bool:
        """Check if error is a connection/network related error."""
        if APIConnectionError and isinstance(error, APIConnectionError):
            return True
        if InternalServerError and isinstance(error, InternalServerError):
            return True
        if ServiceUnavailableError and isinstance(error, ServiceUnavailableError):
            return True

        error_msg = str(error).lower()
        return ("connection" in error_msg or "network" in error_msg or
                "connection reset" in error_msg or "connection refused" in error_msg)

    def _is_invalid_response_error(self, error: Exception) -> bool:
        """Check if error is related to invalid/malformed responses."""
        if BadRequestError and isinstance(error, BadRequestError):
            return True
        if UnprocessableEntityError and isinstance(error, UnprocessableEntityError):
            return True

        error_msg = str(error).lower()
        return ("invalid" in error_msg or "malformed" in error_msg or
                "bad request" in error_msg or "unprocessable" in error_msg)

    @property
    def is_available(self) -> bool:
        """
        Check if AI client is available and ready for operations.

        Performs lazy initialization if needed; initialization failures are
        reported as unavailability instead of being propagated.

        Returns:
            True if AI client is initialized and ready, False otherwise
        """
        try:
            self._ensure_initialized()
            return True
        except AIError:
            return False

    @property
    def llm(self) -> Any:
        """
        Get the underlying LLM instance for direct access.

        Provides access to the raw LangChain LLM object for advanced use
        cases that require direct interaction. Triggers lazy initialization.

        Returns:
            LangChain LLM instance (ChatAnthropic)

        Raises:
            AIError: If LLM initialization fails
        """
        self._ensure_initialized()
        return self._llm

    def generate_response(self, messages: List[dict]) -> str:
        """
        Generate a response using the LLM.

        Args:
            messages: List of message dictionaries for the LLM

        Returns:
            Generated response content, stripped of surrounding whitespace

        Raises:
            AIError: If response generation fails (via _handle_llm_error)
        """
        self._ensure_initialized()

        try:
            response = self._llm.invoke(messages)
            return response.content.strip()
        except Exception as e:
            # Always raises a classified domain exception.
            self._handle_llm_error(e, include_invalid_response=True)

    def generate_text(self, prompt: str, context: Optional[List[str]] = None) -> str:
        """
        Generate text using the AI client.

        Args:
            prompt: The main prompt for text generation
            context: Optional context documents prepended to the prompt

        Returns:
            Generated text response, stripped of surrounding whitespace

        Raises:
            AIError: If text generation fails (via _handle_llm_error)
        """
        self._ensure_initialized()

        # Prepend up to three context documents to keep the request within
        # token limits.
        full_prompt = prompt
        if context:
            context_str = "\n\n".join(context[:3])  # Limit context to prevent token overflow
            full_prompt = f"Context:\n{context_str}\n\n{prompt}"

        try:
            from langchain_core.messages import HumanMessage

            response = self._llm.invoke([HumanMessage(content=full_prompt)])
            return response.content.strip()

        except Exception as e:
            # Always raises a classified domain exception.
            self._handle_llm_error(e, include_invalid_response=True)
|
app/services/ai_config.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AI Configuration
|
| 4 |
+
|
| 5 |
+
Configuration settings for AI service operations.
|
| 6 |
+
Provides type safety and validation for AI service parameters.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
|
| 11 |
+
from app.core.exceptions import ConfigError
|
| 12 |
+
from app.core.constants import TEMPERATURE
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class AIConfig:
    """
    Configuration settings for AI service operations.

    This dataclass encapsulates all configuration parameters needed
    for AI service initialization and operation, providing type safety
    and validation.

    Attributes:
        api_key: Anthropic API key for authentication
        model: Claude model name to use for operations
        temperature: Sampling temperature (0.0 = deterministic, higher = more creative)
        max_tokens: Maximum tokens to generate in responses

    Example:
        config = AIConfig(
            api_key="sk-ant-...",
            model="claude-3-5-sonnet",
            temperature=TEMPERATURE,
            max_tokens=4000
        )
    """
    api_key: str
    model: str
    temperature: float = TEMPERATURE
    max_tokens: int = 4000

    def validate(self) -> None:
        """
        Validate the AI configuration for required values and consistency.

        Checks every parameter: api_key and model must be non-blank, the
        temperature must be non-negative, and max_tokens must be positive.
        (Previously only api_key/model were checked despite the docstring
        promising comprehensive validation; invalid numeric values then
        failed deep inside the API call instead of here.)

        Raises:
            ConfigError: If any configuration values are invalid
        """
        if not self.api_key or not self.api_key.strip():
            raise ConfigError(
                "AI API key is missing",
                user_message="API key is required for AI features",
                recovery_hint="Please configure your Anthropic API key in the sidebar"
            )

        if not self.model or not self.model.strip():
            raise ConfigError(
                "AI model is not specified",
                user_message="AI model selection is required",
                recovery_hint="Please select a Claude model"
            )

        if self.temperature < 0:
            raise ConfigError(
                f"Invalid AI temperature: {self.temperature}",
                user_message="AI temperature must be non-negative",
                recovery_hint="Please use a temperature of 0.0 or higher"
            )

        if self.max_tokens <= 0:
            raise ConfigError(
                f"Invalid AI max_tokens: {self.max_tokens}",
                user_message="AI token limit must be positive",
                recovery_hint="Please configure a positive max_tokens value"
            )
|
app/services/ai_service.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AI Service
|
| 4 |
+
|
| 5 |
+
Provides a clean interface for AI operations.
|
| 6 |
+
Reduces coupling between AI components and the rest of the system.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Optional, Dict, List, Any
|
| 10 |
+
|
| 11 |
+
from app.core.exceptions import AIError, ConfigError
|
| 12 |
+
# Removed circular import: from app.ui.error_handler import handle_processing_errors
|
| 13 |
+
from app.core.exceptions import create_config_error
|
| 14 |
+
from app.core.constants import QA_MAX_TOKENS, SUPPORTED_ANALYSIS_TYPES
|
| 15 |
+
from app.services.ai_config import AIConfig
|
| 16 |
+
from app.services.ai_client import AIClient
|
| 17 |
+
from app.services.response_parser import ResponseParser
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AIService:
|
| 21 |
+
"""
|
| 22 |
+
Simplified AI service providing clean, type-safe interface for AI operations.
|
| 23 |
+
|
| 24 |
+
This service replaces the complex DDChecklistAgent with a focused, simple interface
|
| 25 |
+
that handles the core AI operations needed by the application. It provides:
|
| 26 |
+
|
| 27 |
+
Features:
|
| 28 |
+
- Type-safe AI operations with comprehensive error handling
|
| 29 |
+
- Multiple analysis types (overview, strategic, checklist, questions)
|
| 30 |
+
- Token usage estimation and limits
|
| 31 |
+
- Configurable AI models and parameters
|
| 32 |
+
- Clean separation of concerns
|
| 33 |
+
|
| 34 |
+
Attributes:
|
| 35 |
+
config: AIConfig object containing service configuration
|
| 36 |
+
is_available: Property indicating if service is ready for use
|
| 37 |
+
|
| 38 |
+
Example:
|
| 39 |
+
config = AIConfig(api_key="sk-ant-...", model="claude-3-sonnet-20240229")
|
| 40 |
+
ai_service = AIService(config)
|
| 41 |
+
|
| 42 |
+
if ai_service.is_available:
|
| 43 |
+
result = ai_service.analyze_documents(docs, "overview")
|
| 44 |
+
answer = ai_service.answer_question("What is the revenue?", context)
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
    def __init__(self, config: AIConfig) -> None:
        """
        Initialize AI service with configuration and validate setup.

        Args:
            config: AIConfig object containing service configuration

        Raises:
            ConfigError: If configuration validation fails
        """
        self.config: AIConfig = config
        # Fail fast: surface configuration problems at construction time
        # rather than on the first AI request.
        self.config.validate()
        # The AIClient is created lazily (see _ensure_client) so that
        # constructing the service never performs network I/O.
        self._client: Optional[AIClient] = None
|
| 60 |
+
|
| 61 |
+
@property
|
| 62 |
+
def _ensure_client(self) -> AIClient:
|
| 63 |
+
"""
|
| 64 |
+
Ensure the AI client is properly initialized.
|
| 65 |
+
|
| 66 |
+
Returns:
|
| 67 |
+
Initialized AIClient instance
|
| 68 |
+
|
| 69 |
+
Raises:
|
| 70 |
+
AIError: If client initialization fails
|
| 71 |
+
"""
|
| 72 |
+
if self._client is None:
|
| 73 |
+
self._client = AIClient(self.config)
|
| 74 |
+
return self._client
|
| 75 |
+
|
| 76 |
+
@property
|
| 77 |
+
def is_available(self) -> bool:
|
| 78 |
+
"""
|
| 79 |
+
Check if AI service is available and ready for operations.
|
| 80 |
+
|
| 81 |
+
This property performs lazy initialization if needed and returns
|
| 82 |
+
the availability status of the AI service.
|
| 83 |
+
|
| 84 |
+
Returns:
|
| 85 |
+
True if AI service is initialized and ready, False otherwise
|
| 86 |
+
"""
|
| 87 |
+
try:
|
| 88 |
+
return self._ensure_client.is_available
|
| 89 |
+
except (AIError, ConfigError):
|
| 90 |
+
return False
|
| 91 |
+
|
| 92 |
+
@property
|
| 93 |
+
def llm(self) -> Any:
|
| 94 |
+
"""
|
| 95 |
+
Get the underlying LLM instance for direct access.
|
| 96 |
+
|
| 97 |
+
This property provides access to the raw LangChain LLM object
|
| 98 |
+
for advanced use cases that require direct interaction.
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
LangChain LLM instance (ChatAnthropic)
|
| 102 |
+
|
| 103 |
+
Raises:
|
| 104 |
+
AIError: If LLM is not initialized
|
| 105 |
+
"""
|
| 106 |
+
return self._ensure_client.llm
|
| 107 |
+
|
| 108 |
+
# Removed decorator to avoid circular imports
|
| 109 |
+
def generate_text(
|
| 110 |
+
self,
|
| 111 |
+
prompt: str,
|
| 112 |
+
context: Optional[List[str]] = None,
|
| 113 |
+
max_length: Optional[int] = None
|
| 114 |
+
) -> str:
|
| 115 |
+
"""
|
| 116 |
+
Generate text using the AI service.
|
| 117 |
+
|
| 118 |
+
Args:
|
| 119 |
+
prompt: The main prompt for text generation
|
| 120 |
+
context: Optional context documents
|
| 121 |
+
max_length: Maximum response length
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
Generated text response
|
| 125 |
+
"""
|
| 126 |
+
client = self._ensure_client
|
| 127 |
+
response = client.generate_text(prompt, context)
|
| 128 |
+
return ResponseParser.format_response(response, max_length)
|
| 129 |
+
|
| 130 |
+
# Removed decorator to avoid circular imports
|
| 131 |
+
def analyze_documents(
|
| 132 |
+
self,
|
| 133 |
+
documents: Dict[str, Dict[str, Any]],
|
| 134 |
+
analysis_type: str,
|
| 135 |
+
strategy_text: Optional[str] = None,
|
| 136 |
+
checklist_results: Optional[Dict[str, Any]] = None
|
| 137 |
+
) -> str:
|
| 138 |
+
"""
|
| 139 |
+
Analyze documents using AI with different analysis types.
|
| 140 |
+
|
| 141 |
+
This method performs comprehensive document analysis using AI, supporting
|
| 142 |
+
multiple analysis types for different business use cases.
|
| 143 |
+
|
| 144 |
+
Args:
|
| 145 |
+
documents: Dictionary mapping document names to document data.
|
| 146 |
+
Each document dict should contain 'content' and other metadata.
|
| 147 |
+
analysis_type: Type of analysis to perform. Supported types:
|
| 148 |
+
- "overview": Company overview and business analysis
|
| 149 |
+
- "strategic": Strategic positioning and recommendations
|
| 150 |
+
- "checklist": Due diligence checklist analysis
|
| 151 |
+
- "questions": Answer due diligence questions
|
| 152 |
+
strategy_text: Optional strategy document content for context
|
| 153 |
+
checklist_results: Optional existing checklist results for strategic analysis
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
AI-generated analysis text with comprehensive insights
|
| 157 |
+
|
| 158 |
+
Raises:
|
| 159 |
+
AIError: If analysis fails or service is unavailable
|
| 160 |
+
ValueError: If analysis_type is not supported
|
| 161 |
+
|
| 162 |
+
Example:
|
| 163 |
+
docs = {
|
| 164 |
+
"annual_report.pdf": {"content": "Company financials...", "name": "Annual Report"},
|
| 165 |
+
"strategy.docx": {"content": "Strategic plan...", "name": "Strategy"}
|
| 166 |
+
}
|
| 167 |
+
analysis = ai_service.analyze_documents(docs, "overview")
|
| 168 |
+
"""
|
| 169 |
+
# Input validation
|
| 170 |
+
if not documents:
|
| 171 |
+
raise ValueError("Documents dictionary cannot be None or empty")
|
| 172 |
+
|
| 173 |
+
if not isinstance(documents, dict):
|
| 174 |
+
raise ValueError("Documents must be a dictionary")
|
| 175 |
+
|
| 176 |
+
if analysis_type not in SUPPORTED_ANALYSIS_TYPES:
|
| 177 |
+
raise ValueError(f"Invalid analysis type: {analysis_type}. Supported types: {SUPPORTED_ANALYSIS_TYPES}")
|
| 178 |
+
|
| 179 |
+
# Validate each document has content
|
| 180 |
+
for doc_name, doc_data in documents.items():
|
| 181 |
+
if not isinstance(doc_data, dict):
|
| 182 |
+
raise ValueError(f"Document '{doc_name}' must be a dictionary")
|
| 183 |
+
if 'content' not in doc_data:
|
| 184 |
+
raise ValueError(f"Document '{doc_name}' must contain a 'content' key")
|
| 185 |
+
if not doc_data['content']:
|
| 186 |
+
raise ValueError(f"Document '{doc_name}' content cannot be empty")
|
| 187 |
+
|
| 188 |
+
# Prepare context from documents
|
| 189 |
+
context_docs = ResponseParser.prepare_context_documents(documents)
|
| 190 |
+
|
| 191 |
+
# Create analysis prompt based on type
|
| 192 |
+
prompt = self._get_analysis_prompt(analysis_type, context_docs, strategy_text, checklist_results)
|
| 193 |
+
|
| 194 |
+
return self.generate_text(prompt, max_length=3000)
|
| 195 |
+
|
| 196 |
+
def _get_analysis_prompt(self, analysis_type: str, context_docs: List[str],
|
| 197 |
+
strategy_text: Optional[str] = None,
|
| 198 |
+
checklist_results: Optional[Dict[str, Any]] = None) -> str:
|
| 199 |
+
"""
|
| 200 |
+
Get the appropriate analysis prompt based on analysis type.
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
analysis_type: Type of analysis to perform
|
| 204 |
+
context_docs: Prepared context documents
|
| 205 |
+
strategy_text: Optional strategy document content
|
| 206 |
+
checklist_results: Optional existing checklist results
|
| 207 |
+
|
| 208 |
+
Returns:
|
| 209 |
+
Generated prompt for the specified analysis type
|
| 210 |
+
|
| 211 |
+
Raises:
|
| 212 |
+
ValueError: If analysis_type is not supported
|
| 213 |
+
"""
|
| 214 |
+
if analysis_type == "overview":
|
| 215 |
+
return ResponseParser.create_overview_prompt(context_docs, strategy_text, checklist_results)
|
| 216 |
+
if analysis_type == "strategic":
|
| 217 |
+
return ResponseParser.create_strategic_prompt(context_docs, strategy_text, checklist_results)
|
| 218 |
+
if analysis_type == "checklist":
|
| 219 |
+
return ResponseParser.create_checklist_prompt(context_docs)
|
| 220 |
+
if analysis_type == "questions":
|
| 221 |
+
return ResponseParser.create_questions_prompt(context_docs)
|
| 222 |
+
|
| 223 |
+
raise ValueError(f"Unknown analysis type: {analysis_type}")
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# Removed decorator to avoid circular imports
|
| 227 |
+
def answer_question(
    self,
    question: str,
    context_docs: List[str],
    max_length: Optional[int] = None
) -> str:
    """
    Answer a specific question using AI with document context.

    The question is answered against the supplied document excerpts so
    the response is grounded in the provided material.

    Args:
        question: The question to answer; clear, specific questions
            (e.g. "What is the company's revenue?") give the best results.
        context_docs: Relevant document excerpts that may contain the
            answer, ideally pre-filtered to the most relevant content.
        max_length: Optional maximum answer length in characters; the
            service-wide QA default is used when omitted.

    Returns:
        AI-generated answer with citations and context where applicable.

    Raises:
        ValueError: If the question or context documents are malformed.
        AIError: If question answering fails or service is unavailable.
    """
    # Guard clauses: reject malformed input before spending any tokens.
    if not isinstance(question, str) or not question:
        raise ValueError("Question must be a non-empty string")

    if not context_docs:
        raise ValueError("Context documents list cannot be None or empty")

    if not isinstance(context_docs, list):
        raise ValueError("Context documents must be a list")

    for idx, excerpt in enumerate(context_docs):
        if not isinstance(excerpt, str):
            raise ValueError(f"Context document at index {idx} must be a string")
        if not excerpt.strip():
            raise ValueError(
                f"Context document at index {idx} cannot be empty or whitespace only"
            )

    prompt = ResponseParser.create_question_answer_prompt(question, context_docs)
    return self.generate_text(prompt, max_length=max_length or QA_MAX_TOKENS)
|
| 285 |
+
|
| 286 |
+
def get_token_usage_estimate(self, text: str) -> int:
    """
    Estimate token usage for *text* via a character heuristic.

    Uses an approximation of ~4 characters per token, which is typical
    for English text with Claude models. Actual tokenizer counts may
    differ; use the model's real tokenizer for precise counting.

    Args:
        text: Any string content (prompt, document, or response).

    Returns:
        Estimated token count; 0 for empty or None input.
    """
    # Integer division implements the ~4-chars-per-token approximation.
    return len(text) // 4 if text else 0
|
| 316 |
+
|
| 317 |
+
def is_within_token_limit(self, text: str, max_tokens: int = 100000) -> bool:
    """
    Check whether *text* fits within a token budget.

    Relies on the character-based estimate from
    get_token_usage_estimate, so the result is approximate; use a real
    tokenizer when the limit is critical.

    Args:
        text: Text to check for token limit compliance.
        max_tokens: Maximum allowed tokens (default 100,000 — a
            conservative ceiling for most AI models).

    Returns:
        True when the estimated token count is within the limit,
        False when it exceeds it.
    """
    # Empty/None text trivially fits; otherwise compare the estimate.
    return not text or self.get_token_usage_estimate(text) <= max_tokens
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# Factory function for easy service creation
|
| 353 |
+
def create_ai_service(
    api_key: str,
    model: str,
    temperature: float = 0.1,
    max_tokens: int = 4000
) -> AIService:
    """
    Create and configure an AI service instance.

    Convenience factory that resolves the API key, validates the
    configuration, and returns a ready-to-use service.

    Args:
        api_key: Anthropic API key for authentication; when None, the
            ANTHROPIC_API_KEY environment variable is used instead.
        model: Claude model identifier, e.g. "claude-3-5-sonnet"
            (balanced), "claude-3-5-haiku-20241022" (fast/cheap), or
            "claude-3-opus-20240229" (most capable).
        temperature: Sampling temperature in [0.0, 1.0]; lower values
            (0.1) give more deterministic output, higher (0.7+) more
            creative output.
        max_tokens: Maximum tokens per AI response; the 4000 default
            balances length and cost.

    Returns:
        Fully configured and validated AIService instance.

    Raises:
        ConfigError: If configuration parameters are invalid.
        AIError: If AI service initialization fails.
    """
    resolved_key = _resolve_api_key(api_key)
    return AIService(
        AIConfig(
            api_key=resolved_key,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    )
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def _resolve_api_key(api_key: Optional[str]) -> str:
|
| 415 |
+
"""
|
| 416 |
+
Resolve API key from parameter or environment variable.
|
| 417 |
+
|
| 418 |
+
Args:
|
| 419 |
+
api_key: API key provided by user, or None
|
| 420 |
+
|
| 421 |
+
Returns:
|
| 422 |
+
Resolved API key string
|
| 423 |
+
|
| 424 |
+
Raises:
|
| 425 |
+
ConfigError: If no API key is available
|
| 426 |
+
"""
|
| 427 |
+
if api_key is not None:
|
| 428 |
+
return api_key
|
| 429 |
+
|
| 430 |
+
import os
|
| 431 |
+
env_key = os.getenv('ANTHROPIC_API_KEY')
|
| 432 |
+
if env_key is not None:
|
| 433 |
+
return env_key
|
| 434 |
+
|
| 435 |
+
raise create_config_error(
|
| 436 |
+
"AI API key is missing",
|
| 437 |
+
recovery_hint="Please set ANTHROPIC_API_KEY environment variable or pass api_key parameter"
|
| 438 |
+
)
|
app/services/response_parser.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Response Parser
|
| 4 |
+
|
| 5 |
+
Handles response parsing and formatting functions for AI operations.
|
| 6 |
+
Provides methods for creating prompts and processing AI responses.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
|
| 11 |
+
from app.core.exceptions import ProcessingError
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ResponseParser:
    """
    Parser for AI responses and prompt generation.

    Provides static helpers that build structured prompts for each
    analysis type, clean up AI responses, and prepare document context
    for AI processing.
    """

    @staticmethod
    def create_overview_prompt(
        context_docs: List[str],
        strategy_text: Optional[str],
        checklist_results: Optional[Dict]
    ) -> str:
        """Create overview analysis prompt"""
        prompt = "Based on the following company documents, provide a comprehensive overview analysis:\n\n"

        if context_docs:
            prompt += "Company Documents:\n" + "\n\n".join(context_docs) + "\n\n"

        if strategy_text:
            # Truncate supplementary context to keep the prompt bounded.
            prompt += f"Strategic Context:\n{strategy_text[:1000]}\n\n"

        if checklist_results:
            prompt += f"Checklist Findings:\n{str(checklist_results)[:1000]}\n\n"

        prompt += """Please provide:
1. Company overview and business model
2. Key strengths and competitive advantages
3. Main risks and challenges
4. Financial health indicators
5. Strategic recommendations

Be specific, factual, and focus on the most important insights."""

        return prompt

    @staticmethod
    def create_strategic_prompt(
        context_docs: List[str],
        strategy_text: Optional[str],
        checklist_results: Optional[Dict]
    ) -> str:
        """Create strategic analysis prompt"""
        prompt = "Provide a strategic analysis based on the following company information:\n\n"

        if strategy_text:
            prompt += f"Strategic Framework:\n{strategy_text[:1000]}\n\n"

        if context_docs:
            prompt += "Company Documents:\n" + "\n\n".join(context_docs) + "\n\n"

        if checklist_results:
            prompt += f"Operational Findings:\n{str(checklist_results)[:1000]}\n\n"

        prompt += """Please analyze:
1. Strategic positioning and market opportunities
2. Operational strengths and weaknesses
3. Risk mitigation strategies
4. Growth potential and recommendations
5. Investment considerations

Focus on strategic implications and actionable insights."""

        return prompt

    @staticmethod
    def create_checklist_prompt(context_docs: List[str]) -> str:
        """Create checklist analysis prompt"""
        prompt = "Analyze the following documents against standard due diligence checklist items:\n\n"

        if context_docs:
            prompt += "Documents to Analyze:\n" + "\n\n".join(context_docs) + "\n\n"

        prompt += """For each major due diligence category, identify:
1. What information is available in the documents
2. What information appears to be missing
3. Any red flags or concerns identified
4. Recommendations for further investigation

Be thorough and specific in your analysis."""

        return prompt

    @staticmethod
    def create_questions_prompt(context_docs: List[str]) -> str:
        """Create questions analysis prompt"""
        prompt = "Answer due diligence questions based on the following documents:\n\n"

        if context_docs:
            prompt += "Reference Documents:\n" + "\n\n".join(context_docs) + "\n\n"

        prompt += """For each question, provide:
1. Direct answer based on available information
2. Supporting evidence from the documents
3. Confidence level in the answer
4. Any additional context or caveats

If information is not available, clearly state this and suggest what additional information would be needed."""

        return prompt

    @staticmethod
    def create_question_answer_prompt(question: str, context_docs: List[str]) -> str:
        """Create prompt for answering a specific question"""
        # Join OUTSIDE the f-string: a backslash escape inside an f-string
        # expression is a SyntaxError on Python < 3.12 (only allowed since
        # PEP 701). Only the first 5 excerpts are used to bound prompt size.
        excerpts = "\n\n".join(context_docs[:5])
        return f"""Based on the following document excerpts, please answer this question:

Question: {question}

Relevant Document Excerpts:
{excerpts}

Please provide a comprehensive, factual answer with specific references to the source documents.
If the information is not available in the provided context, clearly state this."""

    @staticmethod
    def format_response(response: str, max_length: Optional[int] = None) -> str:
        """
        Format and clean AI response.

        Args:
            response: Raw AI response
            max_length: Optional maximum length for the response

        Returns:
            Formatted (stripped, possibly truncated) response

        Raises:
            ProcessingError: If response formatting fails
        """
        try:
            if not response:
                raise ValueError("Response cannot be empty")

            result = response.strip()
            if max_length and len(result) > max_length:
                result = result[:max_length] + "..."
            return result
        except Exception as e:
            # Chain the original exception for easier debugging.
            raise ProcessingError(f"Failed to format AI response: {e}") from e

    @staticmethod
    def prepare_context_documents(documents: Dict[str, Dict[str, Any]], max_docs: int = 5) -> List[str]:
        """
        Prepare context documents for AI processing.

        Args:
            documents: Dictionary mapping document names to document data
                (each entry is expected to carry a 'content' key and may
                carry a human-readable 'name')
            max_docs: Maximum number of documents to process

        Returns:
            List of formatted document contexts

        Raises:
            ProcessingError: If document preparation fails
        """
        try:
            if not documents:
                raise ValueError("No documents provided for context preparation")

            context_docs = []
            for doc_key, doc_data in list(documents.items())[:max_docs]:
                if isinstance(doc_data, dict) and 'content' in doc_data:
                    content = doc_data['content'][:1000]  # Truncate long content
                    context_docs.append(f"Document: {doc_data.get('name', doc_key)}\n{content}")

            if not context_docs:
                raise ValueError("No valid documents found with content")

            return context_docs
        except Exception as e:
            # Chain the original exception for easier debugging.
            raise ProcessingError(f"Failed to prepare context documents: {e}") from e
|
app/ui/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UI Components Package
|
| 3 |
+
|
| 4 |
+
Contains all user interface components and layout functions.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .sidebar import Sidebar
|
| 8 |
+
|
| 9 |
+
__all__ = ['Sidebar']
|
app/ui/error_handler.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Standardized Error Handling System
|
| 4 |
+
|
| 5 |
+
Provides consistent error handling patterns across all modules.
|
| 6 |
+
Centralizes error logging, user messaging, and recovery mechanisms.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import streamlit as st
|
| 11 |
+
from typing import Any, Optional, Callable, TypeVar
|
| 12 |
+
from functools import wraps
|
| 13 |
+
|
| 14 |
+
from app.core.exceptions import (
|
| 15 |
+
AppException, ValidationError, ProcessingError,
|
| 16 |
+
AIError, ConfigError
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# Re-export core exceptions for backward compatibility
|
| 22 |
+
AppError = AppException
|
| 23 |
+
|
| 24 |
+
T = TypeVar('T')
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Exception classes are imported from app.core.exceptions above
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ErrorHandler:
    """
    Centralized error handling system with consistent patterns.

    All helpers are static so callers can use them without holding an
    instance; logging severity and user messaging are derived from the
    exception type.
    """

    @staticmethod
    def handle_error(
        error: Exception,
        context: str = "",
        show_user_message: bool = True,
        log_error: bool = True,
        recovery_hint: Optional[str] = None
    ) -> None:
        """
        Handle an error with consistent logging and user messaging.

        Args:
            error: The exception that occurred
            context: Description of where the error occurred
            show_user_message: Whether to show error message to user
            log_error: Whether to log the error
            recovery_hint: Optional hint for user recovery
        """
        if log_error:
            ErrorHandler._log_error(error, context)

        if show_user_message:
            ErrorHandler._show_user_error(error, recovery_hint)

    @staticmethod
    def _log_error(error: Exception, context: str = "") -> None:
        """Log error with appropriate level based on error type"""
        error_msg = f"{context}: {str(error)}" if context else str(error)

        # Expected/user-facing problems log as warnings; operational
        # failures as errors; anything else gets a full traceback.
        if isinstance(error, (ValidationError, ConfigError)):
            logger.warning(error_msg)
        elif isinstance(error, (ProcessingError, AIError)):
            logger.error(error_msg)
        else:
            logger.exception(f"Unexpected error - {error_msg}")

    @staticmethod
    def _show_user_error(error: Exception, recovery_hint: Optional[str] = None) -> None:
        """Show appropriate error message to user"""
        # Imported lazily — presumably to avoid a circular import with
        # the UI package; confirm before moving to module level.
        from app.ui.ui_components import status_message

        if isinstance(error, AppError):
            user_message = error.user_message
        else:
            # For unexpected errors, don't show internal details
            user_message = "An unexpected error occurred. Please try again."

        # Add recovery hint if provided
        if recovery_hint:
            user_message += f"\n\n💡 {recovery_hint}"

        # Validation problems are warnings; everything else is an error.
        if isinstance(error, ValidationError):
            status_message(user_message, "warning")
        else:
            status_message(user_message, "error")

    @staticmethod
    def handle_with_recovery(
        func: Callable[..., T],
        context: str = "",
        default_value: Any = None,
        show_spinner: bool = False,
        spinner_text: str = "Processing...",
        recovery_hint: Optional[str] = None
    ) -> Callable[..., T]:
        """
        Decorator that provides consistent error handling with recovery.

        Args:
            func: Function to wrap
            context: Description of the operation
            default_value: Value to return on error
            show_spinner: Whether to show spinner during operation
            spinner_text: Text to show in spinner
            recovery_hint: Hint for user recovery

        Returns:
            Wrapped function with error handling
        """
        @wraps(func)
        def wrapper(*args, **kwargs) -> T:
            try:
                if show_spinner:
                    with st.spinner(spinner_text):
                        return func(*args, **kwargs)
                else:
                    return func(*args, **kwargs)
            except Exception as e:
                ErrorHandler.handle_error(e, context, recovery_hint=recovery_hint)
                return default_value

        return wrapper

    @staticmethod
    def validate_input(value: Any, validator: Callable[[Any], bool], error_message: str) -> bool:
        """
        Validate input with consistent error handling.

        Args:
            value: Value to validate
            validator: Function that returns True if valid
            error_message: Error message if validation fails

        Returns:
            True if valid (this method never returns False; invalid
            input raises instead)

        Raises:
            ValidationError: If validation fails or the validator itself errors
        """
        try:
            if validator(value):
                return True
            else:
                raise ValidationError(error_message)
        except ValidationError:
            raise
        except Exception as e:
            # Chain the validator's own failure for easier debugging.
            raise ValidationError(f"Validation failed: {str(e)}") from e

    @staticmethod
    def ensure_config_value(config_value: Any, config_name: str) -> Any:
        """
        Ensure a configuration value exists and is valid.

        Args:
            config_value: The configuration value to check
            config_name: Name of the configuration for error messages

        Returns:
            The config value if valid

        Raises:
            ConfigError: If config value is missing or invalid
        """
        if config_value is None or config_value == "":
            raise ConfigError(
                f"Configuration '{config_name}' is missing or empty",
                user_message=f"Configuration error: {config_name} is not set",
                recovery_hint="Please check your configuration and environment variables"
            )
        return config_value

    @staticmethod
    def handle_file_operation(
        file_path: str,
        operation: Callable[[], T],
        operation_name: str = "file operation"
    ) -> T:
        """
        Handle file operations with consistent error handling.

        Args:
            file_path: Path to the file being operated on
            operation: Function that performs the file operation
            operation_name: Description of the operation

        Returns:
            Result of the file operation

        Raises:
            ProcessingError: If the operation fails for any reason
        """
        try:
            return operation()
        except FileNotFoundError as e:
            raise ProcessingError(
                f"File not found: {file_path}",
                user_message=f"File not found: {file_path}",
                recovery_hint="Please ensure the file exists and try again"
            ) from e
        except PermissionError as e:
            raise ProcessingError(
                f"Permission denied accessing file: {file_path}",
                user_message=f"Cannot access file: {file_path}",
                recovery_hint="Please check file permissions"
            ) from e
        except Exception as e:
            raise ProcessingError(
                f"Failed to {operation_name} file {file_path}: {str(e)}",
                user_message=f"File operation failed: {operation_name}",
                recovery_hint="Please check the file and try again"
            ) from e
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# Convenience decorators for common patterns
|
| 215 |
+
def handle_ui_errors(context: str = "", recovery_hint: Optional[str] = None):
    """
    Decorator for UI operations that need error handling.

    Failures are reported through ErrorHandler and swallowed; the
    wrapped callable returns None on error.

    Args:
        context: Description of the operation
        recovery_hint: Optional hint for user recovery
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                # Report the failure and degrade gracefully for the UI.
                ErrorHandler.handle_error(exc, context, recovery_hint=recovery_hint)
                return None
            return result
        return wrapper
    return decorator
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def handle_processing_errors(context: str = "", recovery_hint: Optional[str] = None):
    """
    Decorator for processing operations that need error handling.

    Failures are reported through ErrorHandler and then re-raised so
    the caller can still react to them.

    Args:
        context: Description of the operation
        recovery_hint: Optional hint for user recovery
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                # Report first, then let the caller decide how to recover.
                ErrorHandler.handle_error(exc, context, recovery_hint=recovery_hint)
                raise
        return wrapper
    return decorator
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def validate_and_execute(
    validator: Callable[[], bool],
    operation: Callable[[], T],
    validation_error_msg: str = "Validation failed",
    context: str = ""
) -> T:
    """
    Validate and execute an operation with consistent error handling.

    Args:
        validator: Zero-argument callable returning True when input is valid
        operation: Zero-argument callable executed after validation passes
        validation_error_msg: Error message raised when validation fails
        context: Description of the operation for error reporting

    Returns:
        Whatever *operation* returns

    Raises:
        ValidationError: If validation fails
    """
    try:
        is_valid = validator()
        if not is_valid:
            raise ValidationError(
                validation_error_msg,
                recovery_hint="Please check your input and try again"
            )
        return operation()
    except ValidationError:
        # Validation failures pass through to the caller untouched.
        raise
    except Exception as e:
        ErrorHandler.handle_error(e, f"{context} - validation/execution failed")
        raise
|
app/ui/session_manager.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Session State Manager
|
| 4 |
+
|
| 5 |
+
Manages Streamlit session state with type-safe access.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from app.ui.error_handler import ErrorHandler
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SessionProperty:
    """Descriptor exposing a single Streamlit session-state key as an attribute.

    Declared on a class as ``attr = SessionProperty(default)``, it reads and
    writes ``st.session_state['attr']``, returning the default when the key
    has not been set yet. This removes the need for repetitive hand-written
    properties while keeping attribute-style access.
    """

    def __init__(self, default_value: Any = None):
        # The session-state key name is filled in by __set_name__ when the
        # owning class body is executed.
        self.default_value = default_value
        self.name = None

    def __set_name__(self, owner, name):
        self.name = name

    def __get__(self, instance, owner):
        # Class-level access yields the descriptor object itself so callers
        # can introspect defaults; instance access reads session state.
        return self if instance is None else st.session_state.get(self.name, self.default_value)

    def __set__(self, instance, value):
        st.session_state[self.name] = value
|
| 38 |
+
|
| 39 |
+
class SessionManager:
    """Session state manager with type-safe access to session data.

    Class-level ``SessionProperty`` descriptors map attributes onto
    ``st.session_state`` keys of the same name. ``_init_defaults`` seeds any
    missing keys at construction time so later reads never fall back to the
    shared class-level default objects.
    """

    # Document processing state
    documents = SessionProperty({})
    chunks = SessionProperty([])
    embeddings = SessionProperty(None)

    # Analysis results
    checklist_results = SessionProperty({})
    question_answers = SessionProperty({})
    overview_summary = SessionProperty("")
    strategic_summary = SessionProperty("")

    # User selections
    strategy_path = SessionProperty(None)
    strategy_text = SessionProperty("")
    checklist_path = SessionProperty(None)
    checklist_text = SessionProperty("")
    questions_path = SessionProperty(None)
    questions_text = SessionProperty("")
    vdr_store = SessionProperty(None)
    data_room_path = SessionProperty(None)

    # Processing state
    processing_active = SessionProperty(False)
    agent = SessionProperty(None)

    # Cached data
    checklist = SessionProperty({})
    questions = SessionProperty({})

    def __init__(self) -> None:
        """Initialize session state manager with default values."""
        self._init_defaults()

    def _init_defaults(self) -> None:
        """Seed missing session-state keys with per-session copies of the defaults.

        Never raises: on failure the error is reported via ErrorHandler and
        the session is reset to a minimal known-good state.
        """
        import copy  # local import: only needed while seeding defaults

        try:
            # Collect every SessionProperty descriptor declared on the class
            # together with its declared default value.
            all_properties = {
                name: getattr(self.__class__, name).default_value
                for name in dir(self.__class__)
                if isinstance(getattr(self.__class__, name), SessionProperty)
            }

            for key, default_value in all_properties.items():
                if key not in st.session_state:
                    # BUGFIX: deep-copy mutable defaults ({} / []) so sessions
                    # never share — and mutate — the single class-level
                    # default object.
                    st.session_state[key] = copy.deepcopy(default_value)

        except Exception as e:
            ErrorHandler.handle_error(
                e,
                "Session initialization failed",
                recovery_hint="Please refresh the page and try again"
            )
            # Fall back to a minimal known-good state on error.
            st.session_state.clear()
            st.session_state.update({
                'documents': {},
                'processing_active': False,
                'agent': None,
            })

    def reset(self) -> None:
        """Reset analysis results and cached data for fresh analysis."""
        self.overview_summary = ""
        self.strategic_summary = ""
        self.checklist_results = {}
        self.question_answers = {}

    def reset_processing(self) -> None:
        """Reset processing flags to allow new operations."""
        self.processing_active = False

    def ready(self) -> bool:
        """Check if system is ready for analysis operations."""
        # Ready == at least one document loaded and no processing in flight.
        return bool(self.documents) and not self.processing_active
|
app/ui/sidebar.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Sidebar Component
|
| 4 |
+
|
| 5 |
+
Handles project selection, file selectors, and AI settings.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Tuple, Optional
|
| 11 |
+
|
| 12 |
+
from app.ui.session_manager import SessionManager
|
| 13 |
+
# Use lazy imports to avoid circular import issues
|
| 14 |
+
# from app.handlers.document_handler import DocumentHandler
|
| 15 |
+
# from app.handlers.ai_handler import AIHandler
|
| 16 |
+
# Import components directly to avoid circular import issues
|
| 17 |
+
import importlib.util
|
| 18 |
+
import os
|
| 19 |
+
|
| 20 |
+
# Load the ui_components.py module directly from its file path.
# NOTE(review): this bypasses the normal import system, presumably to break a
# circular import between app.ui modules (the commented-out imports above
# suggest so) — confirm before replacing with a plain
# "from app.ui.ui_components import ..." statement.
components_path = os.path.join(os.path.dirname(__file__), 'ui_components.py')
spec = importlib.util.spec_from_file_location("components_module", components_path)
components_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(components_module)

# Bind the specific component functions we need at module level.
render_project_selector = components_module.render_project_selector
render_ai_settings = components_module.render_ai_settings
render_file_selector = components_module.render_file_selector
display_processing_error = components_module.display_processing_error
status_message = components_module.status_message
| 32 |
+
from app.core import logger
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class Sidebar:
    """
    Sidebar component: project/data-room selection, analysis file selectors
    (strategy, checklist, questions), and AI settings.
    """

    def __init__(self, session: SessionManager, config):
        """Initialize sidebar with session manager and config."""
        self.session = session
        self.config = config
        # Handlers are created lazily (see the properties below), which keeps
        # this module importable without pulling in app.handlers at load time.
        self._document_handler = None
        self._ai_handler = None

    @property
    def document_handler(self):
        """Lazily import and cache a DocumentHandler bound to this session."""
        if self._document_handler is None:
            from app.handlers.document_handler import DocumentHandler
            self._document_handler = DocumentHandler(self.session)
        return self._document_handler

    @property
    def ai_handler(self):
        """Lazily import and cache an AIHandler bound to this session."""
        if self._ai_handler is None:
            from app.handlers.ai_handler import AIHandler
            self._ai_handler = AIHandler(self.session)
        return self._ai_handler

    def render(self) -> Tuple[Optional[str], bool]:
        """
        Render sidebar with project selection, file selectors, and AI settings.

        Returns:
            Tuple of (data_room_path, process_button_pressed); the caller is
            responsible for triggering the actual processing.
        """
        with st.sidebar:
            # Project and data room selection.
            # NOTE(review): selected_project_path is not used in this method.
            selected_project_path, data_room_path = render_project_selector()

            # Process button — processing itself happens in the main area.
            process_button = st.button(
                "🚀 Process Data Room",
                type="primary",
                width='stretch'
            )

            if process_button:
                st.success("Processing... Check main area for progress")

            st.divider()

            # Analysis Configuration
            st.subheader("📋 Analysis Configuration")

            # Strategy selector — persist the selection into the session.
            strategy_path, strategy_text = self._render_file_selector(
                self.config.paths['strategy_dir'], "Strategy", "🎯"
            )
            self.session.strategy_path = strategy_path
            self.session.strategy_text = strategy_text

            # Checklist selector
            checklist_path, checklist_text = self._render_file_selector(
                self.config.paths['checklist_dir'], "Checklist", "📊"
            )
            self.session.checklist_path = checklist_path
            self.session.checklist_text = checklist_text

            # Questions selector
            questions_path, questions_text = self._render_file_selector(
                self.config.paths['questions_dir'], "Questions", "❓"
            )
            self.session.questions_path = questions_path
            self.session.questions_text = questions_text

            st.divider()

            # AI settings
            api_key, model_choice = render_ai_settings()

            # Initialize the AI agent once per session when an API key is set.
            if api_key:
                existing_agent = self.session.agent
                if existing_agent is None:
                    if self.ai_handler.setup_agent(api_key, model_choice):
                        st.success("✅ AI Agent ready")
                    else:
                        # Setup failed: make sure no stale agent is kept.
                        self.session.agent = None

        return data_room_path, process_button

    def _render_file_selector(self, directory: str, label: str, icon: str) -> Tuple[Optional[str], str]:
        """
        Render a file selector for a specific directory.

        Args:
            directory: Path to the directory containing files
            label: Label for the selector
            icon: Icon for the selector

        Returns:
            Tuple of (selected_file_path, selected_file_content);
            (None, "") when rendering fails (the error is logged).
        """
        try:
            return render_file_selector(directory, label, "sidebar", icon)
        except Exception as e:
            logger.error(f"Failed to render {label.lower()} selector: {e}")
            return None, ""

    def process_data_room(self, data_room_path: str):
        """
        Process a data room using the fast FAISS loading approach.

        Args:
            data_room_path: Path to the data room directory

        Errors are logged and shown to the user; nothing is raised.
        """
        try:
            result = self.document_handler.process_data_room_fast(data_room_path)

            if result:
                doc_count, chunk_count = result
                st.success(f"✅ Loaded {doc_count} documents and {chunk_count} chunks from pre-built index!")
                # Re-run the app so the newly loaded data is reflected in the UI.
                st.rerun()
            else:
                display_processing_error("data room")
        except Exception as e:
            logger.error(f"Failed to process data room {data_room_path}: {e}")
            display_processing_error("data room", e)
|
| 164 |
+
|
app/ui/tabs/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tab Components Package
|
| 3 |
+
|
| 4 |
+
Contains all tab-specific UI components and logic.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .tab_base import TabBase
|
| 8 |
+
from .overview_tab import OverviewTab
|
| 9 |
+
from .strategic_tab import StrategicTab
|
| 10 |
+
from .checklist_tab import ChecklistTab
|
| 11 |
+
from .questions_tab import QuestionsTab
|
| 12 |
+
from .qa_tab import QATab
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
'TabBase',
|
| 16 |
+
'OverviewTab',
|
| 17 |
+
'StrategicTab',
|
| 18 |
+
'ChecklistTab',
|
| 19 |
+
'QuestionsTab',
|
| 20 |
+
'QATab'
|
| 21 |
+
]
|
app/ui/tabs/checklist_tab.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Checklist Tab Component
|
| 4 |
+
|
| 5 |
+
Handles checklist matching and display.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
|
| 10 |
+
from app.ui.session_manager import SessionManager
|
| 11 |
+
from app.ui.ui_components import (
|
| 12 |
+
status_message,
|
| 13 |
+
render_generate_buttons,
|
| 14 |
+
processing_guard,
|
| 15 |
+
display_generation_error,
|
| 16 |
+
display_initialization_error
|
| 17 |
+
)
|
| 18 |
+
from app.handlers.ai_handler import AIHandler
|
| 19 |
+
from app.core.logging import logger
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ChecklistTab:
    """
    Checklist matching tab: runs checklist-vs-document matching analysis and
    renders the results.

    Requires a processed data room (documents + FAISS store in the session)
    and a checklist selected in the sidebar.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler."""
        self.session = session
        self.config = config
        self.ai_handler = ai_handler

    def render(self):
        """Render the checklist tab."""
        documents = self.session.documents
        if not documents:
            status_message("👈 Configure and process data room first", "info")
            return

        # Use the checklist selected in the sidebar.
        file_text = self.session.checklist_text

        if not file_text:
            status_message("👈 Select a checklist in the sidebar first", "info")
            return

        # Generate button row
        button_clicked = render_generate_buttons(
            "📊 Generate Matching",
            "regenerate_checklist_btn",
            "checklist_results",
            "Generate checklist matching analysis",
            self.session
        )

        # Generate new results, display cached ones, or prompt the user.
        if button_clicked and not self.session.checklist_results:
            self._generate_checklist_matching()
        elif self.session.checklist_results:
            from app.ui.ui_components import render_checklist_results
            results = self.session.checklist_results
            render_checklist_results(results, relevancy_threshold=self.config.processing['similarity_threshold'])
        else:
            status_message("👆 Click 'Generate Matching' to analyze checklist items against documents", "info")

    @processing_guard()
    def _generate_checklist_matching(self):
        """Parse the selected checklist and match its items against the
        pre-built FAISS index, storing the results in the session.

        The busy flag is managed by the processing_guard decorator; errors are
        logged and surfaced to the user, never raised to the caller.
        """
        # Initialize document processor with the FAISS store loaded during
        # data room processing.
        from app.core import create_document_processor

        # Store name is set in the session when the data room is processed.
        store_name = self.session.vdr_store
        if not store_name:
            st.error("❌ No data room processed. Please process a data room first.")
            return

        document_processor = create_document_processor(store_name=store_name)

        try:
            checklist_text = self.session.checklist_text
            if not checklist_text or not self.session.chunks:
                st.error("❌ No checklist or document chunks available")
                return

            # Check if data room has been processed
            if not hasattr(self.session, 'documents') or not self.session.documents:
                st.error("❌ No data room processed. Please process a data room first before running checklist analysis.")
                return

            # Note: Document type embeddings will be auto-loaded if missing during processing

            with st.spinner("Processing checklist, please wait..."):
                from app.core.parsers import parse_checklist
                from app.core import search_and_analyze

                try:
                    # Parse the raw checklist text into structured items via the LLM.
                    llm = self.ai_handler.llm
                    if not llm:
                        raise ValueError("AI service not configured. Please set up your API key first.")
                    checklist = parse_checklist(checklist_text, llm)
                    self.session.checklist = checklist

                    # Use the pre-built FAISS index from the document processor.
                    if not document_processor.vector_store:
                        raise ValueError("No pre-built FAISS index loaded. Please ensure data room is processed first.")

                    vector_store = document_processor.vector_store

                    # Match checklist items against the index.
                    # NOTE(review): the LLM passed here comes from
                    # session.agent.llm while the parsing above used
                    # ai_handler.llm — confirm these are intentionally
                    # different objects.
                    checklist_results = search_and_analyze(
                        checklist,
                        vector_store,
                        self.ai_handler.session.agent.llm if self.ai_handler.is_agent_available() else None,
                        self.config.processing['similarity_threshold'],
                        'items',
                        store_name=getattr(document_processor, 'store_name', None),
                        session=self.session
                    )
                    self.session.checklist_results = checklist_results

                    status_message("✅ Checklist matching analysis completed!", "success")
                    st.rerun()

                except Exception as e:
                    logger.error(f"Checklist processing failed: {e}")
                    display_generation_error("checklist analysis", e)

        except Exception as e:
            logger.error(f"Failed to initialize document processor: {e}")
            display_initialization_error("document processor", e)
        # Dead `finally: pass` removed — the processing flag is reset by the
        # processing_guard decorator, so no explicit cleanup is needed here.
|
| 136 |
+
|
app/ui/tabs/graph_tab.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Knowledge Graph Tab
|
| 4 |
+
|
| 5 |
+
This tab provides an interface for exploring pre-computed knowledge graphs
|
| 6 |
+
generated from due diligence documents. It offers entity search, relationship
|
| 7 |
+
exploration, and graph analysis capabilities.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import streamlit as st
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from typing import Dict, List, Any, Optional
|
| 15 |
+
|
| 16 |
+
from app.core.knowledge_graph import KnowledgeGraphManager, get_available_knowledge_graphs
|
| 17 |
+
from app.ui.tabs.tab_base import TabBase
|
| 18 |
+
from app.ui.error_handler import handle_ui_errors
|
| 19 |
+
from app.core.logging import logger
|
| 20 |
+
|
| 21 |
+
class GraphTab(TabBase):
|
| 22 |
+
"""Knowledge Graph exploration tab"""
|
| 23 |
+
|
| 24 |
+
    def __init__(self, session_manager, config, ai_handler, export_handler):
        # Delegate common wiring (session, config, handlers) to TabBase.
        super().__init__(session_manager, config, ai_handler, export_handler)
        # Display name and identifier key for this tab.
        self.tab_name = "Knowledge Graph"
        self.tab_key = "graph"
| 28 |
+
|
| 29 |
+
    @handle_ui_errors("Knowledge Graph", "Please try refreshing the page")
    def render(self):
        """Render the knowledge graph tab.

        Requires a loaded company (session.vdr_store); lazily creates and
        caches one KnowledgeGraphManager per company in st.session_state.
        """
        st.header("🧠 Knowledge Graph Explorer")

        # Check if we have a loaded company
        if not self.session.vdr_store:
            st.info("📋 Please load a company first using the sidebar.")
            return

        company_name = self.session.vdr_store

        # Initialize the knowledge graph manager, cached per company so a
        # rerun does not rebuild it.
        if f'kg_manager_{company_name}' not in st.session_state:
            st.session_state[f'kg_manager_{company_name}'] = KnowledgeGraphManager(company_name)

        kg_manager = st.session_state[f'kg_manager_{company_name}']

        # Load the pre-computed graph if not already loaded; bail out with a
        # hint when no graph exists for this company.
        if not kg_manager.is_available():
            with st.spinner("Loading knowledge graph..."):
                if not kg_manager.load_graph():
                    st.error("❌ Knowledge graph not found for this company.")
                    st.info("💡 Run `python scripts/build_knowledge_graphs.py` to generate knowledge graphs.")
                    return

        # Display graph summary
        self._render_graph_summary(kg_manager)

        # Main interface tabs
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "🔍 Entity Search",
            "🔗 Relationship Explorer",
            "📊 Graph Analysis",
            "🎯 Path Finder",
            "🧠 Semantic Search"
        ])

        with tab1:
            self._render_entity_search(kg_manager)

        with tab2:
            self._render_relationship_explorer(kg_manager)

        with tab3:
            self._render_graph_analysis(kg_manager)

        with tab4:
            self._render_path_finder(kg_manager)

        with tab5:
            self._render_semantic_search(kg_manager)
| 81 |
+
|
| 82 |
+
    def _render_graph_summary(self, kg_manager: KnowledgeGraphManager):
        """Render graph summary statistics: headline metrics plus an optional
        entity-type distribution pie chart. No-op when stats are empty."""
        stats = kg_manager.get_summary_stats()

        if not stats:
            return

        # Summary metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Entities", stats.get('num_entities', 0))

        with col2:
            st.metric("Relationships", stats.get('num_relationships', 0))

        with col3:
            entity_types = stats.get('entity_types', {})
            st.metric("Entity Types", len(entity_types))

        with col4:
            rel_types = stats.get('relationship_types', {})
            st.metric("Relationship Types", len(rel_types))

        # Entity distribution chart. (entity_types is still in scope here:
        # `with` blocks do not introduce a new scope in Python.)
        if entity_types:
            with st.expander("📊 Entity Distribution", expanded=False):
                fig = px.pie(
                    values=list(entity_types.values()),
                    names=list(entity_types.keys()),
                    title="Distribution of Entity Types"
                )
                st.plotly_chart(fig, width='stretch')
| 115 |
+
|
| 116 |
+
    def _render_entity_search(self, kg_manager: KnowledgeGraphManager):
        """Render entity search interface: free-text query with an optional
        entity-type filter; results expand to show details and offer a jump
        into the Relationship Explorer via st.session_state['selected_entity']."""
        st.subheader("🔍 Search Entities")

        # Search controls
        col1, col2 = st.columns([3, 1])

        with col1:
            search_query = st.text_input(
                "Search for entities (companies, people, contracts, etc.)",
                placeholder="e.g., Microsoft, John Smith, acquisition...",
                key="entity_search_query"
            )

        with col2:
            # Type filter options come from the graph's summary stats.
            entity_types = ['All'] + list(kg_manager.get_summary_stats().get('entity_types', {}).keys())
            selected_type = st.selectbox(
                "Filter by type",
                entity_types,
                key="entity_type_filter"
            )

        if search_query:
            # Perform search ('All' means no type filter).
            filter_type = None if selected_type == 'All' else selected_type
            results = kg_manager.search_entities(
                search_query,
                entity_type=filter_type,
                limit=20
            )

            if results:
                st.success(f"Found {len(results)} matching entities")

                # Display results; only the top hit starts expanded.
                for i, entity in enumerate(results):
                    with st.expander(f"🏷️ {entity['name']} ({entity['type']})", expanded=i==0):
                        col1, col2 = st.columns([2, 1])

                        with col1:
                            st.write(f"**Type:** {entity['type']}")
                            st.write(f"**Sources:** {entity['sources']}")
                            st.write(f"**Document Type:** {entity['document_type']}")

                            # Show context samples
                            if entity.get('context_samples'):
                                st.write("**Context:**")
                                for context in entity['context_samples']:
                                    if context.strip():
                                        st.write(f"_{context.strip()}_")

                        with col2:
                            st.metric("Relevance Score", f"{entity['score']:.2f}")

                            # Button to hand this entity off to the
                            # Relationship Explorer tab.
                            if st.button(f"Explore Relationships", key=f"explore_{i}"):
                                st.session_state['selected_entity'] = entity['name']
                                st.rerun()
            else:
                st.info("No entities found matching your search criteria.")
|
| 176 |
+
|
| 177 |
+
    def _render_relationship_explorer(self, kg_manager: KnowledgeGraphManager):
        """Render relationship exploration interface: tables of outgoing and
        incoming relationships for a chosen entity, optional per-relationship
        context, and a relationship-type distribution chart."""
        st.subheader("🔗 Relationship Explorer")

        # Entity selection — pre-filled when an entity was picked in the
        # Entity Search tab (via st.session_state['selected_entity']).
        selected_entity = st.session_state.get('selected_entity', '')
        entity_input = st.text_input(
            "Enter entity name to explore relationships",
            value=selected_entity,
            placeholder="e.g., Microsoft, John Smith...",
            key="relationship_entity_input"
        )

        if entity_input:
            # Get relationships (dict with 'outgoing' and 'incoming' lists).
            relationships = kg_manager.get_entity_relationships(entity_input)

            if relationships['outgoing'] or relationships['incoming']:
                # Display outgoing relationships
                if relationships['outgoing']:
                    st.write("### ➡️ Outgoing Relationships")
                    outgoing_data = []
                    for rel in relationships['outgoing']:
                        outgoing_data.append({
                            'Target': rel['target'],
                            'Type': rel['target_type'],
                            'Relationship': rel['relationship'],
                            'Source Doc': rel['source_document'],
                            'Confidence': f"{rel['confidence']:.2f}"
                        })

                    df_out = pd.DataFrame(outgoing_data)
                    st.dataframe(df_out, width='stretch')

                    # Show relationship context on selection
                    if st.checkbox("Show relationship contexts", key="show_outgoing_context"):
                        for i, rel in enumerate(relationships['outgoing']):
                            if rel['context'].strip():
                                st.write(f"**{rel['target']} ({rel['relationship']}):**")
                                st.write(f"_{rel['context']}_")
                                st.write("---")

                # Display incoming relationships
                if relationships['incoming']:
                    st.write("### ⬅️ Incoming Relationships")
                    incoming_data = []
                    for rel in relationships['incoming']:
                        incoming_data.append({
                            'Source': rel['source'],
                            'Type': rel['source_type'],
                            'Relationship': rel['relationship'],
                            'Source Doc': rel['source_document'],
                            'Confidence': f"{rel['confidence']:.2f}"
                        })

                    df_in = pd.DataFrame(incoming_data)
                    st.dataframe(df_in, width='stretch')

                    # Show relationship context on selection
                    if st.checkbox("Show relationship contexts", key="show_incoming_context"):
                        for i, rel in enumerate(relationships['incoming']):
                            if rel['context'].strip():
                                st.write(f"**{rel['source']} ({rel['relationship']}):**")
                                st.write(f"_{rel['context']}_")
                                st.write("---")

                # Relationship type distribution (both directions combined).
                all_rels = relationships['outgoing'] + relationships['incoming']
                rel_types = {}
                for rel in all_rels:
                    rel_type = rel['relationship']
                    rel_types[rel_type] = rel_types.get(rel_type, 0) + 1

                if rel_types:
                    st.write("### 📊 Relationship Type Distribution")
                    fig = px.bar(
                        x=list(rel_types.keys()),
                        y=list(rel_types.values()),
                        title=f"Relationships for {entity_input}"
                    )
                    st.plotly_chart(fig, width='stretch')

            else:
                st.info(f"No relationships found for '{entity_input}'. Try a different entity name.")
|
| 261 |
+
|
| 262 |
+
    def _render_graph_analysis(self, kg_manager: KnowledgeGraphManager):
        """Render graph analysis interface.

        Shows two sections: a horizontal bar chart (plus detail table) of the
        most central entities, and expandable lists of entity clusters.

        Args:
            kg_manager: Knowledge-graph backend providing
                ``get_central_entities`` and ``get_entity_clusters``.
        """
        st.subheader("📊 Graph Analysis")

        # Central entities
        st.write("### 🎯 Most Important Entities")
        central_entities = kg_manager.get_central_entities(limit=15)

        if central_entities:
            # Create a bar chart of centrality scores.
            # Each element is expected to be a dict with 'name', 'type',
            # 'centrality_score', 'num_connections' and 'sources' keys.
            names = [e['name'] for e in central_entities]
            scores = [e['centrality_score'] for e in central_entities]
            types = [e['type'] for e in central_entities]

            # Horizontal orientation so long entity names stay readable.
            fig = px.bar(
                x=scores,
                y=names,
                orientation='h',
                color=types,
                title="Entity Centrality Scores",
                labels={'x': 'Centrality Score', 'y': 'Entity'}
            )
            fig.update_layout(height=500)
            st.plotly_chart(fig, width='stretch')

            # Display detailed table
            with st.expander("📋 Detailed Central Entities", expanded=False):
                central_df = pd.DataFrame([{
                    'Entity': e['name'],
                    'Type': e['type'],
                    'Centrality Score': e['centrality_score'],
                    'Connections': e['num_connections'],
                    'Sources': e['sources']
                } for e in central_entities])
                st.dataframe(central_df, width='stretch')

        # Entity clusters
        st.write("### 🎭 Entity Clusters")
        clusters = kg_manager.get_entity_clusters()

        if clusters:
            st.info(f"Found {len(clusters)} clusters of related entities")

            # Only the first cluster is expanded by default to keep the page short.
            for i, cluster in enumerate(clusters):
                with st.expander(f"Cluster {i+1} ({len(cluster)} entities)", expanded=i==0):
                    # Display cluster as tags
                    cluster_html = " • ".join([f"**{entity}**" for entity in cluster])
                    st.write(cluster_html)
        else:
            st.info("No significant entity clusters found.")
| 313 |
+
    def _render_path_finder(self, kg_manager: KnowledgeGraphManager):
        """Render path finding interface.

        Lets the user enter two entity names and a maximum path length, then
        lists every path found between them by the knowledge graph.

        Args:
            kg_manager: Knowledge-graph backend providing ``find_paths``.
        """
        st.subheader("🎯 Path Finder")
        st.write("Find connections between two entities in the knowledge graph.")

        col1, col2 = st.columns(2)

        with col1:
            source_entity = st.text_input(
                "Source Entity",
                placeholder="e.g., Microsoft",
                key="path_source_entity"
            )

        with col2:
            target_entity = st.text_input(
                "Target Entity",
                placeholder="e.g., OpenAI",
                key="path_target_entity"
            )

        # Bounded at 5 hops; longer paths tend to be noise.
        max_length = st.slider("Maximum Path Length", 1, 5, 3, key="max_path_length")

        # Search only runs when both entities are filled in AND the button is pressed.
        if source_entity and target_entity and st.button("Find Paths", key="find_paths_btn"):
            with st.spinner("Searching for paths..."):
                paths = kg_manager.find_paths(source_entity, target_entity, max_length)

                if paths:
                    st.success(f"Found {len(paths)} path(s) between {source_entity} and {target_entity}")

                    for i, path in enumerate(paths):
                        st.write(f"**Path {i+1}:**")
                        path_str = " → ".join(path)
                        st.write(f"🔗 {path_str}")

                        # Show path length (edges = nodes - 1)
                        st.write(f"_Length: {len(path)-1} steps_")
                        st.write("---")
                else:
                    st.info(f"No paths found between {source_entity} and {target_entity} within {max_length} steps.")

        # Path finding tips
        with st.expander("💡 Path Finding Tips", expanded=False):
            st.write("""
            - **Entity names**: Use exact or partial entity names as they appear in the documents
            - **Path length**: Shorter paths show direct connections, longer paths reveal indirect relationships
            - **Multiple paths**: Different paths can reveal different types of business relationships
            - **Use cases**:
              - Find how two companies are connected
              - Trace investment or acquisition chains
              - Discover business partnerships and alliances
            """)
| 366 |
+
    def _render_semantic_search(self, kg_manager: KnowledgeGraphManager):
        """Render semantic search interface using FAISS embeddings.

        Three sub-sections, each driven by its own input + button pair:
        entity search by natural-language query, "related by context" lookup
        for a reference entity, and semantic path discovery.

        Args:
            kg_manager: Knowledge-graph backend providing
                ``semantic_search_entities``, ``find_related_entities_by_context``
                and ``semantic_path_search``.
        """
        st.subheader("🧠 Semantic Search")
        st.write("Search entities using natural language queries powered by your existing FAISS embeddings.")

        # Semantic entity search
        st.write("### 🔍 Semantic Entity Search")
        semantic_query = st.text_input(
            "Describe what you're looking for (e.g., 'technology companies', 'financial partnerships', 'recent acquisitions')",
            placeholder="e.g., companies involved in AI partnerships",
            key="semantic_entity_query"
        )

        col1, col2 = st.columns([1, 1])
        with col1:
            semantic_limit = st.slider("Max results", 5, 20, 10, key="semantic_limit")
        with col2:
            similarity_threshold = st.slider("Similarity threshold", 0.1, 0.8, 0.3, key="similarity_threshold")

        if semantic_query and st.button("🔍 Semantic Search", key="semantic_search_btn"):
            with st.spinner("Searching using AI embeddings..."):
                results = kg_manager.semantic_search_entities(
                    semantic_query,
                    limit=semantic_limit,
                    similarity_threshold=similarity_threshold
                )

                if results:
                    st.success(f"Found {len(results)} semantically relevant entities")

                    # One expander per hit; only the top result is expanded.
                    for i, entity in enumerate(results):
                        with st.expander(f"🏷️ {entity['name']} ({entity['type']}) - Score: {entity['similarity_score']:.3f}", expanded=i==0):
                            col1, col2 = st.columns([2, 1])

                            with col1:
                                st.write(f"**Type:** {entity['type']}")
                                st.write(f"**Sources:** {entity['sources']}")
                                st.write(f"**Document Type:** {entity['document_type']}")

                                # Show matching context (chunk that matched the query)
                                if entity.get('matching_context'):
                                    st.write("**Relevant Context:**")
                                    st.write(f"_{entity['matching_context']}_")

                                # Show original context samples
                                if entity.get('context_samples'):
                                    st.write("**Entity Context:**")
                                    for context in entity['context_samples']:
                                        if context.strip():
                                            st.write(f"_{context.strip()}_")

                            with col2:
                                st.metric("Similarity Score", f"{entity['similarity_score']:.3f}")

                                # Button to explore relationships: stash the
                                # entity name in session state and rerun so the
                                # explorer view picks it up.
                                if st.button(f"Explore Relations", key=f"semantic_explore_{i}"):
                                    st.session_state['selected_entity'] = entity['name']
                                    st.rerun()
                else:
                    st.info("No entities found matching your semantic query. Try adjusting the similarity threshold or rephrasing your query.")

        # Context-based related entities
        st.write("### 🔗 Find Related by Context")
        st.write("Find entities that appear in similar contexts to a reference entity.")

        context_entity = st.text_input(
            "Reference entity name",
            placeholder="e.g., Microsoft",
            key="context_reference_entity"
        )

        context_limit = st.slider("Max related entities", 3, 15, 5, key="context_limit")

        if context_entity and st.button("Find Related by Context", key="find_context_related_btn"):
            with st.spinner("Finding contextually related entities..."):
                related = kg_manager.find_related_entities_by_context(context_entity, limit=context_limit)

                if related:
                    st.success(f"Found {len(related)} contextually related entities")

                    related_data = []
                    for entity in related:
                        related_data.append({
                            'Entity': entity['name'],
                            'Type': entity['type'],
                            'Similarity': f"{entity['similarity_score']:.3f}",
                            'Reason': entity['relationship_reason'],
                            'Sources': entity['sources']
                        })

                    df_related = pd.DataFrame(related_data)
                    st.dataframe(df_related, width='stretch')

                    # Show context samples for selected entities
                    if st.checkbox("Show context samples", key="show_related_contexts"):
                        for entity in related:
                            if entity.get('context_samples'):
                                st.write(f"**{entity['name']}:**")
                                for context in entity['context_samples']:
                                    if context.strip():
                                        st.write(f"_{context.strip()}_")
                                st.write("---")
                else:
                    st.info(f"No contextually related entities found for '{context_entity}'.")

        # Semantic path search
        st.write("### 🎯 Semantic Path Discovery")
        st.write("Find connection paths that are semantically relevant to your query.")

        path_query = st.text_input(
            "Describe the type of connections you want to find",
            placeholder="e.g., investment relationships, technology partnerships",
            key="semantic_path_query"
        )

        max_semantic_paths = st.slider("Max paths", 3, 10, 5, key="max_semantic_paths")

        if path_query and st.button("Find Semantic Paths", key="semantic_paths_btn"):
            with st.spinner("Discovering relevant connection paths..."):
                paths = kg_manager.semantic_path_search(path_query, max_paths=max_semantic_paths)

                if paths:
                    st.success(f"Found {len(paths)} relevant connection paths")

                    for i, path_info in enumerate(paths):
                        st.write(f"**Path {i+1}:** (Relevance: {path_info['relevance_score']:.3f})")
                        path_str = " → ".join(path_info['path'])
                        st.write(f"🔗 {path_str}")
                        st.write(f"_{path_info['query_relevance']}_")
                        st.write(f"Length: {path_info['path_length']} steps")
                        st.write("---")
                else:
                    st.info(f"No semantically relevant paths found for '{path_query}'.")

        # Semantic search tips
        with st.expander("💡 Semantic Search Tips", expanded=False):
            st.write("""
            **Semantic Search Benefits:**
            - Uses your existing FAISS embeddings for intelligent matching
            - Finds entities based on meaning, not just keywords
            - Discovers hidden relationships through context similarity
            - Leverages the same AI models used in your document analysis

            **Query Examples:**
            - "technology companies with AI focus"
            - "recent merger and acquisition activity"
            - "financial services partnerships"
            - "regulatory compliance issues"
            - "key executive leadership"

            **How it works:**
            1. Your query is embedded using the same model as your documents
            2. FAISS finds the most similar document chunks
            3. Entities from those chunks are returned with similarity scores
            4. Results are ranked by semantic relevance

            **Performance Notes:**
            - Requires existing FAISS indices (same as your document search)
            - No additional models or external services needed
            - Leverages your pre-computed embeddings for fast results
            """)
| 528 |
+
def get_status(self) -> Dict[str, Any]:
|
| 529 |
+
"""Get current status of the knowledge graph tab"""
|
| 530 |
+
if not self.session.vdr_store:
|
| 531 |
+
return {
|
| 532 |
+
'ready': False,
|
| 533 |
+
'message': 'No company loaded'
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
company_name = self.session.vdr_store
|
| 537 |
+
available_graphs = get_available_knowledge_graphs()
|
| 538 |
+
|
| 539 |
+
if company_name not in available_graphs:
|
| 540 |
+
return {
|
| 541 |
+
'ready': False,
|
| 542 |
+
'message': f'Knowledge graph not available for {company_name}'
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
return {
|
| 546 |
+
'ready': True,
|
| 547 |
+
'message': f'Knowledge graph ready for {company_name}'
|
| 548 |
+
}
|
app/ui/tabs/overview_tab.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Overview Tab Component
|
| 4 |
+
|
| 5 |
+
Handles company overview generation and display.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Third-party imports
|
| 12 |
+
import streamlit as st
|
| 13 |
+
|
| 14 |
+
# Local imports
|
| 15 |
+
from app.ui.tabs.tab_base import TabBase
|
| 16 |
+
from app.ui.ui_components import status_message
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class OverviewTab(TabBase):
    """Company overview tab: triggers AI overview generation and displays it."""

    def render(self):
        """Render the overview tab"""
        if not self._check_documents_available():
            return

        # Generate button row
        clicked = self._render_generate_buttons(
            "🤖 Generate Overview",
            "regenerate_overview_btn",
            "overview_summary",
            "Use AI to generate company overview analysis"
        )

        if not self._should_generate_content(clicked, "overview_summary"):
            # Nothing to (re)generate: show existing content or a hint.
            self._render_content_or_placeholder(
                "overview_summary",
                "👆 Click 'Generate Overview' to create AI-powered company analysis"
            )
            return

        self._generate_report("overview", "overview_summary", "✅ Company overview generated successfully!")

    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Ask the AI handler for a report and persist it on the session.

        Args:
            report_type: Report identifier passed through to the AI handler.
            session_attr: Session attribute name that receives the generated text.
            success_message: Message shown when generation succeeds.
        """
        if not self._check_ai_availability():
            return

        with st.spinner("Agent running, please wait..."):
            data_room_name = self._get_data_room_name()

            report_text = self.ai_handler.generate_report(
                report_type,
                documents=self.session.documents,
                data_room_name=data_room_name,
                strategy_text=self.session.strategy_text,
                checklist_results=self.session.checklist_results
            )

            if report_text:
                # Persist, notify, and rerun so the fresh content is rendered.
                setattr(self.session, session_attr, report_text)
                status_message(success_message, "success")
                st.rerun()
            else:
                status_message("Failed to generate overview. Please try again.", "error")

    def _get_export_method_name(self) -> str:
        """Get export method name for overview reports"""
        return "export_overview_report"

    def _get_download_key(self) -> str:
        """Get download button key for overview reports"""
        return "export_overview_btn"
|
app/ui/tabs/qa_tab.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Q&A Tab Component
|
| 4 |
+
|
| 5 |
+
Handles Q&A with citations functionality.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Third-party imports
|
| 12 |
+
import streamlit as st
|
| 13 |
+
|
| 14 |
+
# Local imports
|
| 15 |
+
from app.core import RELEVANCY_THRESHOLD, logger
|
| 16 |
+
from app.handlers.ai_handler import AIHandler
|
| 17 |
+
from app.ui.session_manager import SessionManager
|
| 18 |
+
from app.ui.ui_components import (
|
| 19 |
+
display_processing_error,
|
| 20 |
+
display_generation_error,
|
| 21 |
+
display_download_error,
|
| 22 |
+
status_message
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class QATab:
    """
    Q&A with citations tab that handles question answering and citation display.

    Search results are cached in ``st.session_state`` under a per-question key
    so that widget interactions (e.g. download buttons) do not re-trigger the
    document search on Streamlit reruns.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler.

        Args:
            session: Session manager holding processed data-room state.
            config: App configuration (``config.ui['top_k_search_results']`` is read).
            ai_handler: AI handler used for answer synthesis.
        """
        self.session = session
        self.config = config
        self.ai_handler = ai_handler

    def render(self):
        """Render the Q&A tab: question input plus results for the current question."""
        chunks = self.session.chunks
        if not chunks:
            status_message("👈 Process data room first to enable Q&A", "info")
            return

        # Question input
        question = st.text_input(
            "Ask a question about your documents:",
            placeholder="e.g., What are the main risks? What is the revenue model? Who are the key customers?",
            key="qa_question_input"
        )

        # Handle Q&A query if there's a question
        if question:
            st.divider()
            self._handle_qa_query(question)

    def _handle_qa_query(self, question: str):
        """Search the documents for *question* (or reuse cached results) and render them."""
        # Create a unique key for this Q&A session to prevent resets.
        # NOTE(review): str hashing is salted per process, so keys are only
        # stable within one Streamlit server process — fine for session_state.
        qa_key = f"qa_results_{hash(question) % 100000}"

        # Check if we already have results for this question in session state
        if qa_key not in st.session_state:
            try:
                from app.core import search_documents

                # Initialize document processor with loaded FAISS store
                from app.core import create_document_processor

                # Get the store name from session (set during data room processing)
                store_name = self.session.vdr_store
                if not store_name:
                    st.error("❌ No data room processed. Please process a data room first.")
                    return

                document_processor = create_document_processor(store_name=store_name)

                # Use lower threshold for Q&A to get more relevant results
                qa_threshold = 0.15  # Lower threshold for QA to find more results

                with st.spinner("🔍 Searching documents..."):
                    results = search_documents(
                        question,
                        document_processor,
                        top_k=self.config.ui['top_k_search_results'],
                        threshold=qa_threshold
                    )

                    # Fallback: retry with a very low threshold if nothing matched.
                    if not results:
                        logger.info(f"No results found with threshold {qa_threshold}, trying lower threshold...")
                        fallback_threshold = 0.05  # Very low threshold as last resort
                        results = search_documents(
                            question,
                            document_processor,
                            top_k=self.config.ui['top_k_search_results'],
                            threshold=fallback_threshold
                        )
                        if results:
                            st.info(f"ℹ️ Found results with lower relevance threshold ({fallback_threshold})")

                # Store results in session state to prevent resets
                st.session_state[qa_key] = {
                    'question': question,
                    'results': results,
                    'has_ai': self.ai_handler.is_agent_available()
                }

            except Exception as e:
                logger.error(f"Failed to handle Q&A query: {e}")
                display_processing_error("question", e)
                return

        # Render results from session state
        qa_data = st.session_state[qa_key]
        results = qa_data['results']

        if results:
            # Use agent to synthesize answer if available
            if qa_data['has_ai']:
                self._render_ai_answer(question, results)
            else:
                self._render_direct_results(results)
        else:
            status_message("No relevant information found for your question.", "warning")

    def _render_ai_answer(self, question: str, results: list):
        """Render AI-generated answer with citations."""
        st.markdown("### 🤖 AI Service Answer")
        with st.spinner("AI processing, please wait..."):
            try:
                # Convert the top search hits to plain-text context for the model.
                context_docs = [f"From {r.get('source', 'Unknown')}:\n{r.get('text', '')}" for r in results[:3]]

                # Use the AI handler
                answer_text = self.ai_handler.answer_question(question, context_docs)

                st.markdown(answer_text)

            except Exception as e:
                logger.error(f"Failed to generate AI answer: {e}")
                display_generation_error("AI answer")

        st.divider()
        self._render_source_documents(results, question)

    def _render_direct_results(self, results: list):
        """Render direct search results without AI synthesis."""
        st.markdown("### 📚 Relevant Documents")
        self._render_source_documents(results)

    def _render_source_documents(self, results: list, question: str = ""):
        """Render the top source documents with excerpts, citations, and download buttons.

        Args:
            results: Search result dicts ('text', 'source', 'citation', 'path' keys are read).
            question: Original question; only used to build stable widget keys.
        """
        st.markdown("### 📚 Source Documents")

        # Display source documents with download buttons (top 3 only)
        for i, result in enumerate(results[:3], 1):
            with st.container():
                col1, col2 = st.columns([5, 1])
                with col1:
                    text_content = result.get('text', '')
                    excerpt = text_content[:200] + "..." if len(text_content) > 200 else text_content
                    # Bug fix: the original f-string had a stray ')' after the
                    # closing quote, rendering e.g. `1. "excerpt")`.
                    st.markdown(f'{i}. "{excerpt}"')

                    # Show document info and citation
                    doc_source = result.get('source', 'Unknown')
                    citation = result.get('citation', '')
                    st.caption(f" 📄 {doc_source} ({citation})" if citation else f" 📄 {doc_source}")

                with col2:
                    # Only show one download button
                    self._render_qa_download_button(result, i, question)

    def _format_document_title(self, doc_name: str) -> str:
        """Format document title for display; falls back to the raw name on failure."""
        try:
            from app.core import format_document_title
            return format_document_title(doc_name)
        except Exception:
            return doc_name

    def _render_qa_download_button(self, result: dict, idx: int, question: str):
        """Render a download button for one Q&A result, if its file can be resolved.

        Args:
            result: Search result dict; 'path' and 'source' keys are read.
            idx: 1-based position of the result (part of the widget key).
            question: Original question (part of the widget key).
        """
        doc_path = result.get('path', '')
        if doc_path:
            # Create a more stable key that won't cause resets
            doc_source = result.get('source', 'document')
            button_key = f"qa_dl_{idx}_{hash(doc_path + question) % 100000}"

            # Use consistent path resolution logic
            try:
                from app.ui.ui_components import _resolve_document_path
                resolved_path = _resolve_document_path(doc_path)

                if resolved_path and resolved_path.exists():
                    with open(resolved_path, 'rb') as f:
                        file_bytes = f.read()

                    st.download_button(
                        label="📥 Download",
                        data=file_bytes,
                        file_name=resolved_path.name,  # Use actual filename
                        mime="application/pdf",
                        key=button_key,
                        help=f"Download {doc_source}",
                        width='stretch'
                    )
                else:
                    st.caption("(unavailable)")
            except Exception as e:
                logger.error(f"Download failed: {str(e)}")
                st.caption("(error)")
|
app/ui/tabs/questions_tab.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Questions Tab Component
|
| 4 |
+
|
| 5 |
+
Handles due diligence questions analysis and display.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
|
| 10 |
+
from app.ui.session_manager import SessionManager
|
| 11 |
+
from app.ui.ui_components import (
|
| 12 |
+
status_message,
|
| 13 |
+
render_generate_buttons,
|
| 14 |
+
processing_guard,
|
| 15 |
+
display_generation_error,
|
| 16 |
+
display_initialization_error
|
| 17 |
+
)
|
| 18 |
+
from app.handlers.ai_handler import AIHandler
|
| 19 |
+
from app.core.logging import logger
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class QuestionsTab:
|
| 23 |
+
"""
|
| 24 |
+
Questions tab that handles due diligence questions analysis and display.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler.

        Args:
            session: Session manager holding documents, questions text and answers.
            config: App configuration (``config.processing['relevancy_threshold']`` is read).
            ai_handler: AI handler used for parsing questions and answering them.
        """
        self.session = session
        self.config = config
        self.ai_handler = ai_handler
|
| 33 |
+
def render(self):
|
| 34 |
+
"""Render the questions tab"""
|
| 35 |
+
documents = self.session.documents
|
| 36 |
+
if not documents:
|
| 37 |
+
status_message("👈 Configure and process data room first", "info")
|
| 38 |
+
return
|
| 39 |
+
|
| 40 |
+
# Use questions from sidebar
|
| 41 |
+
file_text = self.session.questions_text
|
| 42 |
+
|
| 43 |
+
if not file_text:
|
| 44 |
+
status_message("👈 Select a questions list in the sidebar first", "info")
|
| 45 |
+
return
|
| 46 |
+
|
| 47 |
+
# Generate button row
|
| 48 |
+
button_clicked = render_generate_buttons(
|
| 49 |
+
"❓ Generate Answers",
|
| 50 |
+
"regenerate_questions_btn",
|
| 51 |
+
"question_answers",
|
| 52 |
+
"Generate answers for due diligence questions",
|
| 53 |
+
self.session
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
# Generate or display content
|
| 57 |
+
if button_clicked and not self.session.question_answers:
|
| 58 |
+
self._generate_question_answers()
|
| 59 |
+
elif self.session.question_answers:
|
| 60 |
+
from app.ui.ui_components import render_question_results
|
| 61 |
+
answers = self.session.question_answers
|
| 62 |
+
# Convert from {'questions': [...]} format to {question_id: answer_data} format
|
| 63 |
+
if isinstance(answers, dict) and 'questions' in answers:
|
| 64 |
+
questions_dict = {}
|
| 65 |
+
for i, question_data in enumerate(answers['questions']):
|
| 66 |
+
questions_dict[f"question_{i}"] = question_data
|
| 67 |
+
render_question_results(questions_dict)
|
| 68 |
+
else:
|
| 69 |
+
render_question_results(answers)
|
| 70 |
+
else:
|
| 71 |
+
status_message("👆 Click 'Generate Answers' to find relevant documents for due diligence questions", "info")
|
| 72 |
+
|
| 73 |
+
@processing_guard()
|
| 74 |
+
def _generate_question_answers(self):
|
| 75 |
+
"""Generate question answering analysis"""
|
| 76 |
+
from app.core.document_processor import DocumentProcessor
|
| 77 |
+
|
| 78 |
+
# Initialize document processor with loaded FAISS store
|
| 79 |
+
from app.core.utils import create_document_processor
|
| 80 |
+
|
| 81 |
+
# Get the store name from session (set during data room processing)
|
| 82 |
+
store_name = self.session.vdr_store
|
| 83 |
+
if not store_name:
|
| 84 |
+
st.error("❌ No data room processed. Please process a data room first.")
|
| 85 |
+
return
|
| 86 |
+
|
| 87 |
+
document_processor = create_document_processor(store_name=store_name)
|
| 88 |
+
|
| 89 |
+
try:
|
| 90 |
+
questions_text = self.session.questions_text
|
| 91 |
+
if not questions_text or not self.session.chunks:
|
| 92 |
+
st.error("❌ No questions or document chunks available")
|
| 93 |
+
return
|
| 94 |
+
|
| 95 |
+
# Show progress indicator
|
| 96 |
+
with st.spinner("🚀 Starting question analysis..."):
|
| 97 |
+
try:
|
| 98 |
+
from app.core.parsers import parse_questions
|
| 99 |
+
from app.core.search import search_and_analyze
|
| 100 |
+
|
| 101 |
+
# Step 1: Parse questions
|
| 102 |
+
st.info("📋 Parsing questions...")
|
| 103 |
+
llm = self.ai_handler.llm
|
| 104 |
+
if not llm:
|
| 105 |
+
raise ValueError("AI service not configured. Please set up your API key first.")
|
| 106 |
+
questions = parse_questions(questions_text, llm)
|
| 107 |
+
self.session.questions = questions
|
| 108 |
+
st.info(f"Found {len(questions)} questions to process")
|
| 109 |
+
|
| 110 |
+
# Step 2: Use pre-built FAISS index
|
| 111 |
+
st.info("🔍 Setting up document search...")
|
| 112 |
+
if not document_processor.vector_store:
|
| 113 |
+
raise ValueError("No pre-built FAISS index loaded. Please ensure data room is processed first.")
|
| 114 |
+
vector_store = document_processor.vector_store
|
| 115 |
+
|
| 116 |
+
# Step 3: Process questions with batch processing
|
| 117 |
+
st.info("🤖 Processing questions with AI (batch mode)...")
|
| 118 |
+
st.info("Using concurrent processing for faster results...")
|
| 119 |
+
|
| 120 |
+
question_answers = search_and_analyze(
|
| 121 |
+
questions,
|
| 122 |
+
vector_store,
|
| 123 |
+
self.ai_handler.session.agent.llm if self.ai_handler.is_agent_available() else None,
|
| 124 |
+
self.config.processing['relevancy_threshold'],
|
| 125 |
+
'questions',
|
| 126 |
+
store_name=getattr(document_processor, 'store_name', None)
|
| 127 |
+
)
|
| 128 |
+
self.session.question_answers = question_answers
|
| 129 |
+
|
| 130 |
+
# Complete
|
| 131 |
+
questions_list = question_answers.get('questions', [])
|
| 132 |
+
answered_count = sum(1 for a in questions_list if a.get('has_answer', False))
|
| 133 |
+
st.success(f"✅ Completed! {answered_count}/{len(questions)} questions answered")
|
| 134 |
+
|
| 135 |
+
status_message("✅ Question answering analysis completed!", "success")
|
| 136 |
+
st.rerun()
|
| 137 |
+
|
| 138 |
+
except Exception as e:
|
| 139 |
+
logger.error(f"Questions processing failed: {e}")
|
| 140 |
+
display_generation_error("question analysis", e)
|
| 141 |
+
except Exception as e:
|
| 142 |
+
logger.error(f"Failed to initialize document processor: {e}")
|
| 143 |
+
display_initialization_error("document processor", e)
|
app/ui/tabs/strategic_tab.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Strategic Analysis Tab Component
|
| 4 |
+
|
| 5 |
+
Handles strategic analysis generation and display.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
|
| 10 |
+
from app.ui.tabs.tab_base import TabBase
|
| 11 |
+
from app.ui.ui_components import status_message
|
| 12 |
+
from app.core import logger
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class StrategicTab(TabBase):
    """Tab component that drives generation and display of the strategic analysis report."""

    def render(self):
        """Draw the strategic analysis tab contents."""
        if not self._check_documents_available():
            return

        # Button row for triggering (re)generation
        clicked = self._render_generate_buttons(
            "🎯 Generate Analysis",
            "regenerate_strategic_btn",
            "strategic_summary",
            "Use AI to generate strategic analysis"
        )

        # Either kick off generation, or show the cached report / placeholder
        if self._should_generate_content(clicked, "strategic_summary"):
            self._generate_report("strategic", "strategic_summary", "✅ Strategic analysis generated successfully!")
        else:
            self._render_content_or_placeholder(
                "strategic_summary",
                "👆 Click 'Generate Analysis' to create AI-powered strategic assessment"
            )

    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Run the AI agent to produce a strategic report and store it on the session.

        Guards against missing AI configuration and concurrent runs, and always
        clears the processing flag when finished.
        """
        if not self._check_ai_availability():
            return
        if not self._check_processing_active():
            return

        # Mark this tab as busy so other tabs refuse to start work
        self._set_processing_active(True)

        try:
            with st.spinner("Agent running, please wait..."):
                room_name = self._get_data_room_name()

                report = self.ai_handler.generate_report(
                    report_type,
                    documents=self.session.documents,
                    data_room_name=room_name,
                    strategy_text=self.session.strategy_text,
                    checklist_results=self.session.checklist_results
                )

                if report:
                    setattr(self.session, session_attr, report)
                    status_message(success_message, "success")
                    st.rerun()
                else:
                    status_message("Failed to generate strategic analysis. Please try again.", "error")
        except Exception as e:
            logger.error(f"Failed to generate strategic analysis: {e}")
            status_message(f"Failed to generate strategic analysis: {str(e)}", "error")
        finally:
            # Always release the busy flag, even on error or rerun
            self._set_processing_active(False)

    def _get_export_method_name(self) -> str:
        """Name of the ExportHandler method used for strategic reports."""
        return "export_strategic_report"

    def _get_download_key(self) -> str:
        """Streamlit widget key for the strategic report download button."""
        return "export_strategic_btn"
|
| 85 |
+
|
app/ui/tabs/tab_base.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Tab Base Component
|
| 4 |
+
|
| 5 |
+
Provides shared functionality for all tab components including common
|
| 6 |
+
initialization patterns, render methods, and export functionality.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
# Standard library imports
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, Any, Dict
|
| 12 |
+
|
| 13 |
+
# Third-party imports
|
| 14 |
+
import streamlit as st
|
| 15 |
+
|
| 16 |
+
# Local imports
|
| 17 |
+
from app.ui.error_handler import handle_ui_errors
|
| 18 |
+
from app.handlers.ai_handler import AIHandler
|
| 19 |
+
from app.handlers.export_handler import ExportHandler
|
| 20 |
+
from app.ui.session_manager import SessionManager
|
| 21 |
+
from app.ui.ui_components import status_message, render_generate_buttons
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TabBase:
    """
    Base class for tab components with shared functionality.

    Provides common patterns for initialization, rendering, and export
    functionality. Subclasses implement render(), _generate_report(),
    _get_export_method_name(), and _get_download_key().
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler, export_handler: ExportHandler):
        """Initialize tab with session manager, config, and handlers.

        Args:
            session: Shared per-user session state manager.
            config: Application configuration object (shape not enforced here).
            ai_handler: Handler wrapping AI agent / report generation.
            export_handler: Handler exposing the export_* methods for downloads.
        """
        self.session = session
        self.config = config
        self.ai_handler = ai_handler
        self.export_handler = export_handler

    def render(self):
        """Render the tab - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement render()")

    def _check_documents_available(self) -> bool:
        """Check if documents are available; shows an info message and returns False if not."""
        if not self.session.documents:
            status_message("👈 Configure and process data room first", "info")
            return False
        return True

    def _render_generate_buttons(self, generate_label: str, regenerate_key: str,
                                 session_attr: str, help_text: str) -> tuple[bool, bool]:
        """Render common generate and regenerate buttons using reusable component.

        NOTE(review): annotated as returning a (generate, regenerate) tuple,
        but subclasses appear to use the result as a single truthy flag —
        confirm against render_generate_buttons' actual return value.
        """
        return render_generate_buttons(
            generate_label,
            regenerate_key,
            session_attr,
            help_text,
            self.session
        )

    def _should_generate_content(self, generate_clicked: bool, session_attr: str) -> bool:
        """Determine if content should be generated (button pressed and nothing cached on the session yet)."""
        return generate_clicked and not getattr(self.session, session_attr)

    def _should_display_content(self, session_attr: str) -> bool:
        """Determine if content should be displayed (session already holds a result for this attribute)."""
        return bool(getattr(self.session, session_attr))

    def _get_data_room_name(self) -> str:
        """Get the data room name from documents.

        Derived from the parent directory of the first document path;
        returns "Unknown" when no documents are loaded.
        """
        if not self.session.documents:
            return "Unknown"
        # Documents are keyed by file path; the data room is the containing folder name.
        return Path(list(self.session.documents.keys())[0]).parent.name

    def _check_ai_availability(self) -> bool:
        """Check if AI agent is available; shows an error message and returns False when it is not."""
        if not self.ai_handler.is_agent_available():
            status_message("AI Agent not available. Please configure your API key in the sidebar.", "error")
            return False
        return True

    def _check_processing_active(self) -> bool:
        """Return True when it is safe to start new work (no other operation running).

        Despite the name, a True result means processing is NOT currently
        active; a warning is shown and False returned when something is.
        """
        if self.session.processing_active:
            status_message("⚠️ Another operation is currently running. Please wait.", "warning")
            return False
        return True

    def _set_processing_active(self, active: bool):
        """Set the shared processing-active flag that gates concurrent operations."""
        self.session.processing_active = active

    @handle_ui_errors("Report generation", "Please check your documents and try again")
    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Generate report using AI - to be implemented by subclasses.

        NOTE(review): the @handle_ui_errors decorator applies only to this
        base stub; subclasses that override without re-decorating (as
        StrategicTab does) do not inherit it — confirm that is intended.
        """
        raise NotImplementedError("Subclasses must implement _generate_report()")

    def _render_export_button(self, export_method_name: str, download_key: str):
        """Render export button for reports.

        Relies on the naming convention export_<x>_report -> session
        attribute <x>_summary to decide whether a report exists to export.
        """
        # Get the session attribute dynamically from the export method name
        session_attr = export_method_name.replace("export_", "").replace("_report", "_summary")
        if not getattr(self.session, session_attr):
            return

        # Call the export method dynamically; it returns (file_name, data)
        export_method = getattr(self.export_handler, export_method_name)
        file_name, export_data = export_method()

        if file_name and export_data:
            st.download_button(
                "📥 Export Report",
                data=export_data,
                file_name=file_name,
                mime="text/markdown",
                key=download_key,
                help="Download report as Markdown file"
            )

    def _render_content_or_placeholder(self, session_attr: str, placeholder_message: str):
        """Render stored content if available, otherwise show a placeholder message."""
        content = getattr(self.session, session_attr)
        if content:
            if isinstance(content, str):
                st.markdown(content)
            else:
                # Handle dict/other types as needed by subclasses
                self._render_custom_content(content)
            self._render_export_button(self._get_export_method_name(), self._get_download_key())
        else:
            status_message(placeholder_message, "info")

    def _render_custom_content(self, content: Any):
        """Render custom content types - can be overridden by subclasses (no-op by default)."""
        pass

    def _get_export_method_name(self) -> str:
        """Get export method name - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement _get_export_method_name()")

    def _get_download_key(self) -> str:
        """Get download button key - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement _get_download_key()")
|