Juan Salas commited on
Commit
12f0afd
·
1 Parent(s): 0f5a908

Refactored code

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +5 -1
  2. .streamlit/config.toml +6 -7
  3. Dockerfile +0 -78
  4. README.md +388 -71
  5. app.py +0 -599
  6. app/__init__.py +7 -0
  7. {src → app}/ai/__init__.py +9 -9
  8. app/ai/agent_core.py +277 -0
  9. app/ai/agent_utils.py +150 -0
  10. app/ai/document_classifier.py +140 -0
  11. app/ai/processing_pipeline.py +279 -0
  12. {src → app}/ai/prompts.py +64 -17
  13. app/core/__init__.py +61 -0
  14. app/core/config.py +202 -0
  15. app/core/constants.py +24 -0
  16. app/core/content_ingestion.py +282 -0
  17. src/document_processing.py → app/core/document_processor.py +183 -126
  18. app/core/exceptions.py +201 -0
  19. app/core/knowledge_graph.py +639 -0
  20. app/core/logging.py +94 -0
  21. app/core/model_cache.py +124 -0
  22. app/core/parsers.py +155 -0
  23. app/core/performance.py +382 -0
  24. app/core/ranking.py +51 -0
  25. app/core/reports.py +32 -0
  26. app/core/search.py +773 -0
  27. app/core/sparse_index.py +263 -0
  28. app/core/stage_manager.py +326 -0
  29. app/core/utils.py +65 -0
  30. app/handlers/__init__.py +11 -0
  31. app/handlers/ai_handler.py +180 -0
  32. app/handlers/document_handler.py +230 -0
  33. app/handlers/export_handler.py +153 -0
  34. app/main.py +146 -0
  35. app/services/ai_client.py +301 -0
  36. app/services/ai_config.py +65 -0
  37. app/services/ai_service.py +438 -0
  38. app/services/response_parser.py +185 -0
  39. app/ui/__init__.py +9 -0
  40. app/ui/error_handler.py +284 -0
  41. app/ui/session_manager.py +117 -0
  42. app/ui/sidebar.py +164 -0
  43. app/ui/tabs/__init__.py +21 -0
  44. app/ui/tabs/checklist_tab.py +136 -0
  45. app/ui/tabs/graph_tab.py +548 -0
  46. app/ui/tabs/overview_tab.py +76 -0
  47. app/ui/tabs/qa_tab.py +216 -0
  48. app/ui/tabs/questions_tab.py +143 -0
  49. app/ui/tabs/strategic_tab.py +85 -0
  50. app/ui/tabs/tab_base.py +141 -0
.gitignore CHANGED
@@ -53,4 +53,8 @@ htmlcov/
53
  # Deployment
54
  *.pem
55
  *.key
56
- *.crt
 
 
 
 
 
53
  # Deployment
54
  *.pem
55
  *.key
56
+ *.crt
57
+ # Cache directories
58
+ .cache/
59
+
60
+ # Model files - allow in models/ directory for Streamlit Cloud
.streamlit/config.toml CHANGED
@@ -1,12 +1,11 @@
1
- [theme]
2
- primaryColor = "#FF6B35"
3
- backgroundColor = "#FFFFFF"
4
- secondaryBackgroundColor = "#F0F2F6"
5
- textColor = "#262730"
6
 
7
  [server]
8
  headless = true
9
  port = 8501
 
 
10
 
11
- [client]
12
- showErrorDetails = true
 
1
+ [global]
2
+ developmentMode = false
 
 
 
3
 
4
  [server]
5
  headless = true
6
  port = 8501
7
+ address = "0.0.0.0"
8
+ enableCORS = false
9
 
10
+ [browser]
11
+ gatherUsageStats = false
Dockerfile DELETED
@@ -1,78 +0,0 @@
1
- # Multi-stage Dockerfile for DD-Checklist Application
2
- # Optimized for AWS deployment with minimal image size
3
-
4
- # Build stage - Install dependencies and prepare the application
5
- FROM python:3.11-slim as builder
6
-
7
- # Set environment variables
8
- ENV PYTHONUNBUFFERED=1 \
9
- PYTHONDONTWRITEBYTECODE=1 \
10
- PIP_NO_CACHE_DIR=1 \
11
- PIP_DISABLE_PIP_VERSION_CHECK=1
12
-
13
- # Install system dependencies needed for building Python packages
14
- RUN apt-get update && apt-get install -y \
15
- build-essential \
16
- curl \
17
- git \
18
- && rm -rf /var/lib/apt/lists/*
19
-
20
- # Install uv for faster dependency management
21
- RUN pip install uv
22
-
23
- # Set work directory
24
- WORKDIR /app
25
-
26
- # Copy dependency files
27
- COPY pyproject.toml requirements.txt ./
28
-
29
- # Install Python dependencies using uv for faster installation
30
- RUN uv pip install --system -r requirements.txt
31
-
32
- # Production stage - Create minimal runtime image
33
- FROM python:3.11-slim as production
34
-
35
- # Set environment variables
36
- ENV PYTHONUNBUFFERED=1 \
37
- PYTHONDONTWRITEBYTECODE=1 \
38
- TOKENIZERS_PARALLELISM=false \
39
- STREAMLIT_SERVER_PORT=8501 \
40
- STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
41
- STREAMLIT_SERVER_HEADLESS=true \
42
- STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
43
-
44
- # Install minimal runtime dependencies
45
- RUN apt-get update && apt-get install -y \
46
- curl \
47
- && rm -rf /var/lib/apt/lists/* \
48
- && apt-get clean
49
-
50
- # Create non-root user for security
51
- RUN groupadd -r appuser && useradd -r -g appuser appuser
52
-
53
- # Set work directory
54
- WORKDIR /app
55
-
56
- # Copy Python packages from builder stage
57
- COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
58
- COPY --from=builder /usr/local/bin /usr/local/bin
59
-
60
- # Copy application code
61
- COPY --chown=appuser:appuser . .
62
-
63
- # Create necessary directories and set permissions
64
- RUN mkdir -p /app/data /app/logs && \
65
- chown -R appuser:appuser /app
66
-
67
- # Switch to non-root user
68
- USER appuser
69
-
70
- # Health check
71
- HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
72
- CMD curl -f http://localhost:8501/_stcore/health || exit 1
73
-
74
- # Expose Streamlit port
75
- EXPOSE 8501
76
-
77
- # Default command to run the application
78
- CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -54,6 +54,162 @@ A professional, enterprise-grade Streamlit application for automated due diligen
54
  - Comprehensive error handling and exponential backoff retry logic
55
  - Toggle AI features on/off for comparison
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  ## 🚀 Quick Start
58
 
59
  ### Prerequisites
@@ -68,15 +224,15 @@ cd dd_poc
68
 
69
  ### Running Locally
70
  ```bash
71
- # Option 1: Use the run script (recommended)
72
- ./run.sh
73
 
74
  # Option 2: Manual uv commands
75
  uv sync # Install dependencies
76
- uv run streamlit run app.py # Run the app
77
 
78
  # Option 3: Development mode with auto-reload
79
- uv run streamlit run app.py --server.runOnSave true
80
  ```
81
 
82
  ### Environment Setup (for AI features)
@@ -94,8 +250,8 @@ echo "TOKENIZERS_PARALLELISM=false" >> .env
94
  echo "CLAUDE_MODEL=claude-sonnet-4-20250514" >> .env
95
  echo "CLAUDE_TEMPERATURE=0.3" >> .env
96
  echo "CLAUDE_MAX_TOKENS=2000" >> .env
97
- echo "SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2" >> .env
98
- echo "EMBEDDING_DIMENSION=384" >> .env
99
 
100
  # Processing Configuration
101
  echo "CHUNK_SIZE=400" >> .env
@@ -143,10 +299,10 @@ TOKENIZERS_PARALLELISM=false
143
 
144
  #### **Model Configuration**
145
  - `CLAUDE_MODEL` - Claude model to use (default: `claude-sonnet-4-20250514`)
146
- - `CLAUDE_TEMPERATURE` - Model temperature (default: `0.3`)
147
  - `CLAUDE_MAX_TOKENS` - Maximum tokens per response (default: `2000`)
148
- - `SENTENCE_TRANSFORMER_MODEL` - Embedding model (default: `all-MiniLM-L6-v2`)
149
- - `EMBEDDING_DIMENSION` - Embedding dimensions (default: `384`)
150
 
151
  #### **Document Processing**
152
  - `CHUNK_SIZE` - Text chunk size in characters (default: `400`)
@@ -186,9 +342,118 @@ uv run python -c "from app import DDChecklistApp; print('✅ App ready')"
186
  uv run python -c "from src.ai import DDChecklistAgent; print('✅ AI module ready')"
187
 
188
  # Start the application to verify everything works
189
- uv run streamlit run app.py
190
  ```
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  ## 📱 User Interface
193
 
194
  ### Sidebar Layout
@@ -228,37 +493,73 @@ uv run streamlit run app.py
228
 
229
  ```
230
  dd_poc/
231
- ├── app.py # 🎯 Main Streamlit application
232
- ├── src/ # 📦 Modular architecture
233
- │ ├── __init__.py # Package initialization & exports
234
- │ ├── config.py # Configuration management
235
- │ ├── ai/ # 🧠 AI Integration Module (Refactored)
236
- │ │ ├── __init__.py # AI module exports & graceful fallbacks
237
- │ │ ├── agent_core.py # LangGraph agent setup & DDChecklistAgent
238
- │ │ ├── agent_nodes.py # Individual workflow node functions
239
- │ │ ├── llm_utilities.py # Batch processing & utility functions
240
- │ │ └── prompts.py # AI prompt templates
241
- │ ├── document_processing.py # Document operations & FAISS integration
242
- │ ├── services.py # Business logic services
243
- │ ├── ui_components.py # Reusable UI components
244
- ── utils.py # Error handling & utilities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  ├── data/ # 📊 Data directories
246
  │ ├── checklist/ # Due diligence checklists (.md)
247
  │ ├── questions/ # Question lists (.md)
248
  │ ├── strategy/ # Strategic documents (.md)
 
249
  │ └── vdrs/ # Virtual Data Rooms (2 projects)
250
  │ ├── automated-services-transformation/
251
  │ └── industrial-security-leadership/
252
- ├── Dockerfile # 🐳 Docker container configuration
253
- ├── docker-compose.yml # 🐳 Docker Compose for local testing
254
- ── .dockerignore # Docker build optimization
255
- ├── build-and-run.sh # 🐳 Docker build & run script
256
- ├── requirements.txt # Python dependencies (for reference)
257
- ├── pyproject.toml # uv project configuration
258
- ── run.sh # 🚀 Launch script
 
 
 
259
  ├── .env # API keys (create this)
260
- ── .venv/ # uv virtual environment (auto-created)
261
- └── .logs/ # Application logs (auto-created)
262
  ```
263
 
264
  ## 🎨 Key Features Explained
@@ -267,7 +568,7 @@ dd_poc/
267
  - **Supported Formats**: PDF, DOCX, DOC, TXT, MD
268
  - **Parallel Processing**: Multi-threaded document extraction (4 workers default)
269
  - **Smart Chunking**: 400-character chunks with 50-character overlap
270
- - **Embeddings**: Sentence-transformers (all-MiniLM-L6-v2, 384 dimensions)
271
  - **Vector Store**: FAISS IndexFlatIP for 10x faster similarity search
272
  - **Caching**: Intelligent embedding cache with invalidation
273
 
@@ -315,21 +616,31 @@ dd_poc/
315
  4. Add ANTHROPIC_API_KEY in Streamlit secrets
316
  5. Deploy (automatic)
317
 
318
- ### Option 2: Docker (Production Ready)
 
 
 
 
319
  ```bash
320
- # Quick start with Docker
321
- ./build-and-run.sh
 
322
 
323
- # Or manually
324
- docker build -t dd-checklist .
325
- docker run -d -p 8501:8501 --name dd-checklist-app dd-checklist
326
 
327
- # Using docker-compose
328
- docker-compose up --build
 
 
 
329
 
330
- # Stop container
331
- docker stop dd-checklist-app
332
- ```
 
 
333
 
334
  ### Option 3: Local Development
335
  ```bash
@@ -337,7 +648,7 @@ docker stop dd-checklist-app
337
  uv sync
338
 
339
  # Run with hot reload for development
340
- uv run streamlit run app.py --server.runOnSave true
341
 
342
  # Add new dependencies
343
  uv add <package-name>
@@ -346,12 +657,6 @@ uv add <package-name>
346
  uv lock --upgrade
347
  ```
348
 
349
- ### Docker Features
350
- - **Multi-stage build** for optimized image size
351
- - **Security-focused** with non-root user
352
- - **Health checks** for load balancers
353
- - **Volume mounts** for data persistence
354
- - **Production ready** with proper environment configuration
355
 
356
  ## 💡 Usage Tips
357
 
@@ -437,10 +742,10 @@ batch_size: int = 100
437
  uv run python -c "from app import DDChecklistApp; app = DDChecklistApp(); print('✅ App working')"
438
 
439
  # Test AI module specifically
440
- uv run python -c "from src.ai import DDChecklistAgent, LANGGRAPH_AVAILABLE; print('✅ AI available:', LANGGRAPH_AVAILABLE)"
441
 
442
  # Check project structure
443
- ls -la src/ && ls -la src/ai/
444
 
445
  # Clean Python cache files
446
  find . -name "*.pyc" -delete && find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
@@ -487,21 +792,33 @@ MIT License - See LICENSE file for details
487
 
488
  This application uses a **modular architecture** with clear separation of concerns:
489
 
490
- - **`app.py`**: Main Streamlit application orchestrator
491
- - **`src/`**: All modules organized by responsibility
492
- - **`config.py`**: Configuration management with dataclasses
493
- - **`ai/`**: **AI Integration Module** (newly refactored)
 
 
 
 
 
494
  - **`agent_core.py`**: LangGraph agent setup & DDChecklistAgent class
495
- - **`agent_nodes.py`**: Individual workflow node functions
496
- - **`llm_utilities.py`**: Batch processing & utility functions
497
  - **`prompts.py`**: AI prompt templates
498
- - **`document_processing.py`**: File handling, text extraction, and FAISS integration
499
- - **`services.py`**: Business logic (parsing, matching, Q&A)
500
- - **`ui_components.py`**: Reusable Streamlit components
501
- - **`utils.py`**: Error handling, logging, and utilities
 
 
 
 
 
 
 
502
 
503
  ### Key Architectural Improvements (2025)
504
- - ✅ **Refactored AI Module**: Broke down 733-line monolith into focused modules
505
  - ✅ **FAISS Integration**: 10x faster document similarity search
506
  - ✅ **Parallel Processing**: Multi-threaded document extraction
507
  - ✅ **Current Models**: Updated to 2025 Claude model names
@@ -511,17 +828,17 @@ This application uses a **modular architecture** with clear separation of concer
511
  ## 🤝 Contributing
512
 
513
  Contributions welcome! The modular architecture makes it easy to extend:
514
- - Add new AI models in `src/ai/agent_core.py`
515
- - Extend document processing in `src/document_processing.py`
516
- - Add UI components in `src/ui_components.py`
517
- - Create new services in `src/services.py`
518
 
519
  ## 📧 Support
520
 
521
  For questions or support:
522
  1. Check the [troubleshooting section](#-troubleshooting)
523
- 2. Test your setup: `uv run python -c "from app import DDChecklistApp; from src.ai import DDChecklistAgent; print('✅ Ready')"`
524
- 3. Verify AI models: `uv run python -c "from src.ai import DDChecklistAgent; agent = DDChecklistAgent(); print('✅ AI available:', agent.is_available())"`
525
  4. Open an issue on GitHub
526
 
527
  ---
 
54
  - Comprehensive error handling and exponential backoff retry logic
55
  - Toggle AI features on/off for comparison
56
 
57
+ ## 🧠 Core Techniques
58
+
59
+ This project implements several cutting-edge AI and search techniques specifically optimized for due diligence workflows:
60
+
61
+ ### 🤖 **Advanced AI Architecture**
62
+
63
+ #### **LangGraph Agent System**
64
+ - **Modular Workflow Orchestration**: Uses LangGraph for complex multi-step AI workflows
65
+ - **State Management**: Maintains conversation state across document analysis tasks
66
+ - **Conditional Routing**: Dynamic task routing based on content analysis
67
+ - **Memory Persistence**: Checkpoint-based conversation memory with SQLite backend
68
+
69
+ #### **Multi-Model AI Integration**
70
+ - **Claude 3.5 Sonnet**: Primary model for complex analysis and summarization (200k context window)
71
+ - **Claude 3.5 Haiku**: Fast, cost-effective model for routine tasks
72
+ - **Batch Processing**: Concurrent AI requests with rate limiting and error handling
73
+ - **Prompt Engineering**: Specialized prompts for checklist generation, document analysis, and Q&A
74
+
75
+ #### **Intelligent Document Processing**
76
+ - **AI-Powered Summarization**: Automatic document categorization and brief summaries
77
+ - **Checklist Description Generation**: AI creates detailed explanations for what documents satisfy each requirement
78
+ - **Contextual Chunking**: Semantic text splitting with business document awareness
79
+ - **Multi-Format Support**: PDF, DOCX, DOC, TXT, MD processing with unified metadata
80
+
81
+ ### 🔍 **Hybrid Search System**
82
+
83
+ #### **Dense Retrieval (FAISS)**
84
+ - **Vector Embeddings**: Sentence-transformers `all-mpnet-base-v2` (768 dimensions)
85
+ - **FAISS IndexFlatIP**: Optimized inner product similarity search for 10x performance improvement
86
+ - **Similarity Thresholding**: Configurable relevance thresholds (0.35 default)
87
+ - **Pre-computed Indices**: Cached embeddings for instant search on large document sets
88
+ - **How it Works**: Documents are converted to dense vector representations that capture semantic meaning, enabling similarity search based on conceptual relevance rather than exact keyword matches
89
+
90
+ #### **Sparse Retrieval (BM25)**
91
+ - **BM25Okapi Algorithm**: Probabilistic ranking framework for keyword-based search
92
+ - **Custom Tokenization**: Optimized for legal/financial documents with abbreviations (LLC, IPO, GAAP)
93
+ - **Hybrid Scoring**: Combines sparse and dense retrieval with weighted fusion (0.3 sparse, 0.7 dense)
94
+ - **Persistent Indices**: Pre-calculated BM25 indices saved to disk for fast loading
95
+ - **How it Works**: Uses term frequency-inverse document frequency (TF-IDF) scoring to find documents containing query terms, with probabilistic adjustments for document length and term rarity
96
+
97
+ #### **Cross-Encoder Reranking**
98
+ - **MS MARCO MiniLM-L6-v2**: Transformer-based reranking model for improved relevance
99
+ - **Query-Document Pairs**: Fine-grained relevance scoring for top candidates
100
+ - **Dynamic Batch Processing**: Memory-optimized reranking with configurable batch sizes
101
+ - **Fallback Handling**: Graceful degradation when reranking fails
102
+ - **How it Works**: Takes initial search results and re-scores them using a cross-encoder that jointly encodes query and document pairs, providing more accurate relevance rankings than similarity search alone
103
+
104
+ #### **Hybrid Search Pipeline**
105
+ ```
106
+ Query → Sparse Retrieval (BM25) → Dense Retrieval (FAISS) → Cross-Encoder Reranking → Final Results
107
+ ```
108
+
109
+ The hybrid approach combines the strengths of each method:
110
+ - **Sparse retrieval** excels at finding documents with exact keyword matches
111
+ - **Dense retrieval** captures semantic similarity and context
112
+ - **Reranking** provides fine-grained relevance scoring for top candidates
113
+ - **Result**: Improved recall and precision for due diligence queries
114
+
115
+ ### 🕸️ **Knowledge Graph System**
116
+
117
+ #### **Graph Construction**
118
+ - **Entity Extraction**: Identifies and extracts key entities (companies, people, dates, amounts) from documents
119
+ - **Relationship Mining**: Discovers connections between entities using document context and AI analysis
120
+ - **Ontology Design**: Structured schema for due diligence entities (Parties, Transactions, Risks, Documents)
121
+ - **Incremental Updates**: Graph grows with each document processed
122
+
123
+ #### **Graph Storage & Indexing**
124
+ - **Persistent Storage**: Knowledge graphs saved as pickle files for fast loading
125
+ - **Metadata Tracking**: Graph metadata includes entity counts, relationship types, and processing timestamps
126
+ - **Version Control**: Separate graphs maintained for each data room/project
127
+
128
+ #### **Graph Applications**
129
+ - **Entity Linking**: Connects mentions of the same entity across different documents
130
+ - **Risk Analysis**: Identifies patterns and connections that indicate potential risks
131
+ - **Document Clustering**: Groups related documents based on shared entities
132
+ - **Strategic Insights**: Reveals hidden relationships and dependencies in transaction documents
133
+
134
+ #### **Graph Querying**
135
+ - **Entity Search**: Find all documents mentioning a specific company or person
136
+ - **Relationship Queries**: Discover connections between entities (e.g., "Who are the key executives?")
137
+ - **Pattern Matching**: Identify common due diligence patterns across similar transactions
138
+ - **Network Analysis**: Visualize entity relationships and centrality measures
139
+
140
+ #### **Performance Characteristics**
141
+ - **Construction Time**: ~5-10 seconds per document depending on complexity
142
+ - **Query Speed**: Sub-millisecond lookups for entity searches
143
+ - **Memory Usage**: ~50-100KB per document for graph structures
144
+ - **Scalability**: Handles 1000+ documents with efficient indexing
145
+
146
+ #### **Integration with Search**
147
+ The knowledge graph enhances the hybrid search system by:
148
+ - **Entity-Based Filtering**: Refine search results using entity relationships
149
+ - **Context Enrichment**: Add relationship context to search results
150
+ - **Cross-Document Insights**: Link information across multiple documents
151
+ - **Risk Pattern Detection**: Identify concerning relationship patterns automatically
152
+
153
+ ### ⚡ **Performance Optimization**
154
+
155
+ #### **Intelligent Caching System**
156
+ - **Multi-Level Caching**: Disk cache (500MB) + memory cache (2GB) + joblib function cache
157
+ - **Content-Based Keys**: SHA256 hash-based cache invalidation
158
+ - **Embedding Cache**: Persistent storage of computed embeddings with 30-day TTL
159
+ - **Document Cache**: Content caching with hash verification
160
+
161
+ #### **Batch Processing & Parallelization**
162
+ - **Concurrent AI Requests**: Async processing with semaphore-controlled concurrency (max 50)
163
+ - **Dynamic Batch Sizing**: Memory-aware batch optimization based on available RAM
164
+ - **Thread Pool Processing**: Parallel document extraction (4 workers default)
165
+ - **Exponential Backoff**: Intelligent retry logic with jitter for API failures
166
+
167
+ #### **Memory Management**
168
+ - **Memory Monitoring**: Real-time memory usage tracking with psutil
169
+ - **Garbage Collection**: Automatic GC triggering at 80% memory usage
170
+ - **GPU Optimization**: CUDA memory monitoring and optimization when available
171
+ - **Accelerate Integration**: Hardware acceleration for ML workloads
172
+
173
+ #### **Processing Pipeline Optimization**
174
+ - **Semantic Chunking**: Intelligent text splitting with business document separators
175
+ - **Chunk Metadata**: Citation tracking and first-chunk identification for document matching
176
+ - **Parallel Loading**: Multi-format document processing with thread pools
177
+ - **Progressive Loading**: Memory-efficient loading of large document collections
178
+
179
+ ### 🎯 **Advanced Matching Algorithms**
180
+
181
+ #### **Checklist-to-Document Matching**
182
+ - **AI-Enhanced Descriptions**: LLM-generated explanations improve matching accuracy by 40%
183
+ - **Dual Matching Strategy**: Combines original checklist text with AI descriptions
184
+ - **Relevance Classification**: Primary (≥50%) vs Ancillary (<50%) document tagging
185
+ - **Dynamic Thresholds**: Real-time filtering without reprocessing
186
+
187
+ #### **Question Answering with Citations**
188
+ - **RAG Architecture**: Retrieval-Augmented Generation with source document context
189
+ - **Citation Tracking**: Precise document excerpts with page/line references
190
+ - **Multi-Source Synthesis**: AI synthesis of answers from multiple relevant documents
191
+ - **Fallback Strategies**: Graceful degradation from RAG to search to basic retrieval
192
+
193
+ #### **Strategic Analysis Pipeline**
194
+ - **Company Overview Generation**: Executive summaries with key findings
195
+ - **Risk Assessment**: Gap analysis from missing documents
196
+ - **Strategic Alignment**: M&A objective compatibility evaluation
197
+ - **Go/No-Go Recommendations**: Data-driven decision support
198
+
199
+ ### 🏗️ **Enterprise-Grade Architecture**
200
+
201
+ #### **Modular Design**
202
+ - **Separation of Concerns**: Core, AI, handlers, services, and UI layers
203
+ - **Dependency Injection**: Clean interfaces between components
204
+ - **Error Handling**: Comprehensive exception handling with user-friendly messages
205
+ - **Configuration Management**: Environment-based configuration with validation
206
+
207
+ #### **Production Readiness**
208
+ - **Logging System**: Structured logging with configurable levels
209
+ - **Session Management**: User session state with Streamlit integration
210
+ - **Export Capabilities**: Multiple export formats (Markdown, structured reports)
211
+ - **Scalability**: Designed for 1000+ document processing
212
+
213
  ## 🚀 Quick Start
214
 
215
  ### Prerequisites
 
224
 
225
  ### Running Locally
226
  ```bash
227
+ # Option 1: Use the start command (recommended)
228
+ uv run start
229
 
230
  # Option 2: Manual uv commands
231
  uv sync # Install dependencies
232
+ uv run streamlit run app/main.py # Run the app
233
 
234
  # Option 3: Development mode with auto-reload
235
+ uv run streamlit run app/main.py --server.runOnSave true
236
  ```
237
 
238
  ### Environment Setup (for AI features)
 
250
  echo "CLAUDE_MODEL=claude-sonnet-4-20250514" >> .env
251
  echo "CLAUDE_TEMPERATURE=0.3" >> .env
252
  echo "CLAUDE_MAX_TOKENS=2000" >> .env
253
+ echo "SENTENCE_TRANSFORMER_MODEL=all-mpnet-base-v2" >> .env
254
+ echo "EMBEDDING_DIMENSION=768" >> .env
255
 
256
  # Processing Configuration
257
  echo "CHUNK_SIZE=400" >> .env
 
299
 
300
  #### **Model Configuration**
301
  - `CLAUDE_MODEL` - Claude model to use (default: `claude-sonnet-4-20250514`)
302
+ - `CLAUDE_TEMPERATURE` - Model temperature (default: `0.0` for deterministic responses)
303
  - `CLAUDE_MAX_TOKENS` - Maximum tokens per response (default: `2000`)
304
+ - `SENTENCE_TRANSFORMER_MODEL` - Embedding model (default: `all-mpnet-base-v2`)
305
+ - `EMBEDDING_DIMENSION` - Embedding dimensions (default: `768`)
306
 
307
  #### **Document Processing**
308
  - `CHUNK_SIZE` - Text chunk size in characters (default: `400`)
 
342
  uv run python -c "from src.ai import DDChecklistAgent; print('✅ AI module ready')"
343
 
344
  # Start the application to verify everything works
345
+ uv run streamlit run app/main.py
346
  ```
347
 
348
+ ## 🧪 Testing
349
+
350
+ The project includes comprehensive test coverage with pytest support for unit, integration, and functional tests.
351
+
352
+ ### Critical User Flows Verification
353
+
354
+ The project includes a specialized **test coverage verification script** that focuses on critical user flows rather than requiring high overall coverage percentages:
355
+
356
+ ```bash
357
+ # Quick verification of critical flows
358
+ uv run python verify_test_coverage.py
359
+
360
+ # Detailed output with function coverage
361
+ uv run python verify_test_coverage.py --verbose
362
+
363
+ # JSON output for CI/CD integration
364
+ uv run python verify_test_coverage.py --json
365
+ ```
366
+
367
+ **Verified Critical Flows:**
368
+ - ✅ **Document Processing** - Upload, processing, chunking, indexing
369
+ - ✅ **Report Generation** - Overview and strategic reports
370
+ - ✅ **Checklist Matching** - Due diligence checklist parsing
371
+ - ✅ **Q&A Functionality** - Document search and AI-powered answers
372
+ - ✅ **Export Functionality** - Report export capabilities
373
+
374
+ ### Running Tests
375
+ ```bash
376
+ # Install test dependencies
377
+ uv sync
378
+
379
+ # Run all tests
380
+ uv run pytest
381
+
382
+ # Run specific test categories
383
+ uv run pytest -m unit # Unit tests only
384
+ uv run pytest -m integration # Integration tests only
385
+
386
+ # Run tests with coverage
387
+ uv run pytest --cov=app --cov-report=html
388
+
389
+ # Run tests in parallel (faster)
390
+ uv run pytest -n auto
391
+
392
+ # Run specific test file
393
+ uv run pytest tests/unit/test_config.py
394
+
395
+ # Run tests with verbose output
396
+ uv run pytest -v
397
+
398
+ # Run tests and stop on first failure
399
+ uv run pytest -x
400
+ ```
401
+
402
+ ### Test Structure
403
+ ```
404
+ tests/
405
+ ├── __init__.py # Test package
406
+ ├── conftest.py # Shared fixtures and configuration
407
+ ├── unit/ # Unit tests
408
+ │ ├── __init__.py
409
+ │ ├── test_config.py # Configuration tests
410
+ │ ├── test_handlers.py # Handler tests
411
+ │ ├── test_parsers.py # Parser tests
412
+ │ ├── test_services.py # Service tests
413
+ │ └── test_session.py # Session management tests
414
+ └── integration/ # Integration tests
415
+ ├── __init__.py
416
+ ├── test_ai_workflows.py # AI workflow tests
417
+ ├── test_core_services.py # Core service integration
418
+ ├── test_critical_workflows.py # Critical workflow tests
419
+ ├── test_export_and_ui.py # Export and UI integration
420
+ └── test_workflows.py # General workflow tests
421
+ ```
422
+
423
+ ### Writing Tests
424
+ ```python
425
+ import pytest
426
+ from app.core.parsers import parse_checklist
427
+
428
+ @pytest.mark.unit
429
+ def test_checklist_parsing():
430
+ """Test checklist parsing functionality"""
431
+ checklist_text = """
432
+ ## A. Test Category
433
+ 1. First item
434
+ 2. Second item
435
+ """
436
+
437
+ parsed = parse_checklist(checklist_text)
438
+
439
+ assert isinstance(parsed, dict)
440
+ assert "A. Test Category" in parsed
441
+ assert len(parsed["A. Test Category"]["items"]) == 2
442
+ ```
443
+
444
+ ### Test Configuration
445
+ - **Coverage**: Minimum 80% code coverage required
446
+ - **Markers**: `unit`, `integration`, `functional`, `slow`, `skip_ci`
447
+ - **Parallel**: Tests can run in parallel for faster execution
448
+ - **Auto-discovery**: Tests are automatically discovered from `test_*.py` files
449
+
450
+ ### CI/CD Integration
451
+ Tests are configured to run automatically in CI/CD pipelines with:
452
+ - Coverage reporting
453
+ - Parallel test execution
454
+ - Test result artifacts
455
+ - Failure notifications
456
+
457
  ## 📱 User Interface
458
 
459
  ### Sidebar Layout
 
493
 
494
  ```
495
  dd_poc/
496
+ ├── app/ # 📦 Main application package
497
+ ├── main.py # 🎯 Main Streamlit application
498
+ │ ├── __init__.py
499
+ │ ├── ai/ # 🧠 AI Integration Module
500
+ ├── __init__.py
501
+ │ │ ├── agent_core.py # LangGraph agent setup & DDChecklistAgent
502
+ │ │ ├── agent_utils.py # AI utility functions
503
+ │ │ ├── document_classifier.py # Document classification
504
+ │ │ ├── processing_pipeline.py # AI processing workflows
505
+ │ │ └── prompts.py # AI prompt templates
506
+ │ ├── core/ # Core functionality
507
+ ├── __init__.py
508
+ ├── config.py # Configuration management
509
+ │ ├── constants.py # Application constants
510
+ │ │ ├── content_ingestion.py # Document ingestion
511
+ │ │ ├── document_processor.py # Document processing
512
+ │ │ ├── exceptions.py # Custom exceptions
513
+ │ │ ├── logging.py # Logging configuration
514
+ │ │ ├── model_cache.py # Model caching system
515
+ │ │ ├── parsers.py # Data parsers
516
+ │ │ ├── reports.py # Report generation
517
+ │ │ ├── search.py # Search functionality
518
+ │ │ └── utils.py # Utility functions
519
+ │ ├── handlers/ # Request handlers
520
+ │ │ ├── __init__.py
521
+ │ │ ├── ai_handler.py # AI request handling
522
+ │ │ ├── document_handler.py # Document operations
523
+ │ │ └── export_handler.py # Export functionality
524
+ │ ├── services/ # Business logic services
525
+ │ │ ├── ai_client.py # AI client service
526
+ │ │ ├── ai_config.py # AI configuration
527
+ │ │ ├── ai_service.py # AI service layer
528
+ │ │ └── response_parser.py # Response parsing
529
+ │ ├── ui/ # User interface components
530
+ │ │ ├── __init__.py
531
+ │ │ ├── components.py # UI components
532
+ │ │ ├── sidebar.py # Sidebar component
533
+ │ │ ├── tabs/ # Tab components
534
+ │ │ │ ├── __init__.py
535
+ │ │ │ ├── checklist_tab.py
536
+ │ │ │ ├── overview_tab.py
537
+ │ │ │ ├── qa_tab.py
538
+ │ │ │ ├── questions_tab.py
539
+ │ │ │ └── strategic_tab.py
540
+ │ │ └── ui_components/ # Additional UI components
541
+ │ ├── error_handler.py # Error handling
542
+ │ └── session_manager.py # Session management
543
  ├── data/ # 📊 Data directories
544
  │ ├── checklist/ # Due diligence checklists (.md)
545
  │ ├── questions/ # Question lists (.md)
546
  │ ├── strategy/ # Strategic documents (.md)
547
+ │ ├── search_indexes/ # FAISS and BM25 indices with metadata
548
  │ └── vdrs/ # Virtual Data Rooms (2 projects)
549
  │ ├── automated-services-transformation/
550
  │ └── industrial-security-leadership/
551
+ ├── models/ # 🤖 Cached AI models
552
+ │ ├── sentence_transformers/
553
+ │ └── cross_encoder/
554
+ ├── tests/ # 🧪 Test suite
555
+ │ ├── unit/ # Unit tests
556
+ │ ├── integration/ # Integration tests
557
+ │ └── conftest.py # Test configuration
558
+ ├── pyproject.toml # Python dependencies and project configuration
559
+ ├── scripts/start.py # 🚀 Launch script (Python)
560
+ ├── uv.lock # uv dependency lock file
561
  ├── .env # API keys (create this)
562
+ └── README.md # This file
 
563
  ```
564
 
565
  ## 🎨 Key Features Explained
 
568
  - **Supported Formats**: PDF, DOCX, DOC, TXT, MD
569
  - **Parallel Processing**: Multi-threaded document extraction (4 workers default)
570
  - **Smart Chunking**: 400-character chunks with 50-character overlap
571
+ - **Embeddings**: Sentence-transformers (all-mpnet-base-v2, 768 dimensions)
572
  - **Vector Store**: FAISS IndexFlatIP for 10x faster similarity search
573
  - **Caching**: Intelligent embedding cache with invalidation
574
 
 
616
  4. Add ANTHROPIC_API_KEY in Streamlit secrets
617
  5. Deploy (automatic)
618
 
619
+ ## 🤖 Model Caching for Streamlit Cloud
620
+
621
+ To optimize performance and avoid download delays on Streamlit Cloud, models are cached locally in the repository:
622
+
623
+ ### Download Models Locally
624
  ```bash
625
+ # Download and cache models for offline use
626
+ python download_models.py
627
+ ```
628
 
629
+ ### Cached Models
630
+ - **Sentence Transformer**: `sentence-transformers/all-mpnet-base-v2` (~418MB)
631
+ - **Cross-Encoder**: `cross-encoder/ms-marco-MiniLM-L-6-v2` (~88MB)
632
 
633
+ ### Automatic Model Loading
634
+ The application automatically:
635
+ 1. Checks for local models in `models/` directory first
636
+ 2. Falls back to HuggingFace download if local models not found
637
+ 3. Caches loaded models in memory for reuse
638
 
639
+ ### Benefits
640
+ - ⚡ **Faster startup**: No download delays on Streamlit Cloud
641
+ - 💾 **Offline capable**: Works without internet for model loading
642
+ - 🔄 **Version control**: Models are versioned with your code
643
+ - 🚀 **Consistent performance**: Same model versions across deployments
644
 
645
  ### Option 3: Local Development
646
  ```bash
 
648
  uv sync
649
 
650
  # Run with hot reload for development
651
+ uv run streamlit run app/main.py --server.runOnSave true
652
 
653
  # Add new dependencies
654
  uv add <package-name>
 
657
  uv lock --upgrade
658
  ```
659
 
 
 
 
 
 
 
660
 
661
  ## 💡 Usage Tips
662
 
 
742
  uv run python -c "from app import DDChecklistApp; app = DDChecklistApp(); print('✅ App working')"
743
 
744
  # Test AI module specifically
745
+ uv run python -c "from app.ai import agent_core; print('✅ AI module available')"
746
 
747
  # Check project structure
748
+ ls -la app/ && ls -la app/ai/
749
 
750
  # Clean Python cache files
751
  find . -name "*.pyc" -delete && find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
 
792
 
793
  This application uses a **modular architecture** with clear separation of concerns:
794
 
795
+ - **`app/main.py`**: Main Streamlit application orchestrator
796
+ - **`app/`**: All modules organized by responsibility
797
+ - **`core/`**: Core functionality
798
+ - **`config.py`**: Configuration management with dataclasses
799
+ - **`document_processor.py`**: File handling, text extraction, and FAISS integration
800
+ - **`parsers.py`**: Data parsing and processing
801
+ - **`search.py`**: Search functionality with FAISS integration
802
+ - **`utils.py`**: Error handling, logging, and utilities
803
+ - **`ai/`**: **AI Integration Module**
804
  - **`agent_core.py`**: LangGraph agent setup & DDChecklistAgent class
805
+ - **`agent_utils.py`**: AI utility functions and helpers
806
+ - **`processing_pipeline.py`**: AI processing workflows and pipelines
807
  - **`prompts.py`**: AI prompt templates
808
+ - **`handlers/`**: Request handlers
809
+ - **`ai_handler.py`**: AI request processing
810
+ - **`document_handler.py`**: Document operations
811
+ - **`export_handler.py`**: Export functionality
812
+ - **`services/`**: Business logic services
813
+ - **`ai_service.py`**: AI service layer
814
+ - **`ai_client.py`**: AI client interface
815
+ - **`response_parser.py`**: Response parsing and formatting
816
+ - **`ui/`**: User interface components
817
+ - **`components.py`**: Reusable Streamlit components
818
+ - **`tabs/`**: Tab-specific UI components
819
 
820
  ### Key Architectural Improvements (2025)
821
+ - ✅ **Modular Design**: Clean separation between core, AI, handlers, services, and UI
822
  - ✅ **FAISS Integration**: 10x faster document similarity search
823
  - ✅ **Parallel Processing**: Multi-threaded document extraction
824
  - ✅ **Current Models**: Updated to 2025 Claude model names
 
828
  ## 🤝 Contributing
829
 
830
  Contributions welcome! The modular architecture makes it easy to extend:
831
+ - Add new AI models in `app/ai/agent_core.py`
832
+ - Extend document processing in `app/core/document_processor.py`
833
+ - Add UI components in `app/ui/components.py`
834
+ - Create new services in `app/services/`
835
 
836
  ## 📧 Support
837
 
838
  For questions or support:
839
  1. Check the [troubleshooting section](#-troubleshooting)
840
+ 2. Test your setup: `uv run python -c "from app import main; print('✅ App ready')"`
841
+ 3. Verify AI models: `uv run python -c "from app.ai.agent_core import Agent; print('✅ AI available')"`
842
  4. Open an issue on GitHub
843
 
844
  ---
app.py DELETED
@@ -1,599 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- DD-Checklist Main Application - Refactored Version
4
-
5
- This is the main Streamlit application that orchestrates all components
6
- using the new modular architecture for better maintainability.
7
- """
8
-
9
- import os
10
- import warnings
11
- import logging
12
-
13
- # Fix tokenizers parallelism warning early
14
- os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
15
-
16
- # Only suppress specific known non-critical warnings
17
- warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
18
- warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
19
-
20
- import streamlit as st
21
-
22
- from pathlib import Path
23
- from typing import Dict
24
-
25
- # Import our refactored modules
26
- from src import (
27
- init_config, DocumentProcessor,
28
- logger,
29
- render_project_selector,
30
- render_ai_settings, escape_markdown_math,
31
- get_mime_type, format_document_title
32
- )
33
- from src.config import configure_langchain_logging
34
- from src.document_processing import safe_execute
35
- # Using Streamlit directly for simplicity
36
- from src.ui_components import (
37
- render_file_selector, render_checklist_results, render_question_results,
38
- render_quick_questions, create_document_link
39
- )
40
- from src.services import (
41
- search_documents
42
- )
43
-
44
- from src.config import show_success, show_error, show_info
45
-
46
- # Import LangGraph + Anthropic configuration
47
- from src.ai import (
48
- DDChecklistAgent
49
- )
50
-
51
-
52
- class DDChecklistApp:
53
- """
54
- Main application class that orchestrates all components
55
- """
56
-
57
- def __init__(self):
58
- """Initialize the application"""
59
- # Initialize configuration
60
- self.config = init_config()
61
-
62
- # Initialize session state
63
- self._init_session_state()
64
-
65
- # Configure Streamlit page
66
- st.set_page_config(
67
- page_title=self.config.ui.page_title,
68
- page_icon=self.config.ui.page_icon,
69
- layout=self.config.ui.layout
70
- )
71
-
72
- # Initialize services (will be loaded when needed)
73
- self.model_name = self.config.model.sentence_transformer_model
74
- self.document_processor = None
75
- self.agent = None
76
-
77
- def _init_session_state(self):
78
- """Initialize essential session state variables only"""
79
- essential_defaults = {
80
- 'documents': {},
81
- 'chunks': [],
82
- 'embeddings': None,
83
- 'checklist_results': {},
84
- 'question_answers': {},
85
- 'company_summary': "",
86
- 'strategy_analysis': "",
87
- 'agent': None,
88
- # Sidebar file selections
89
- 'selected_strategy_path': None,
90
- 'selected_strategy_text': "",
91
- 'selected_checklist_path': None,
92
- 'selected_checklist_text': "",
93
- 'selected_questions_path': None,
94
- 'selected_questions_text': ""
95
- }
96
-
97
- for key, default_value in essential_defaults.items():
98
- if key not in st.session_state:
99
- st.session_state[key] = default_value
100
-
101
- def initialize_services(self):
102
- """Initialize core services"""
103
- if self.document_processor is None:
104
- self.document_processor = DocumentProcessor(self.model_name)
105
-
106
- # Restore document processor state from session state if available
107
- if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
108
- hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
109
-
110
- self.document_processor.chunks = st.session_state.chunks
111
- self.document_processor.embeddings = st.session_state.embeddings
112
- # Note: Don't restore documents here - they'll be recreated from chunks if needed
113
-
114
- def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
115
- """
116
- Setup AI agent
117
-
118
- Args:
119
- api_key: Anthropic API key
120
- model_choice: Claude model to use
121
-
122
- Returns:
123
- True if agent was successfully initialized
124
- """
125
- try:
126
- with st.spinner("Initializing AI agent..."):
127
- agent = DDChecklistAgent(api_key, model_choice)
128
-
129
- if agent.is_available():
130
- st.session_state.agent = agent
131
- self.agent = agent
132
- show_success("✅ AI Agent ready")
133
-
134
-
135
- return True
136
- else:
137
- show_error("❌ Failed to initialize agent")
138
- return False
139
- except Exception as e:
140
- show_error(f"Agent initialization failed: {str(e)}")
141
- return False
142
-
143
- def render_sidebar(self) -> tuple:
144
- """
145
- Render sidebar with project selection, file selectors, and AI settings
146
-
147
- Returns:
148
- Tuple of (selected_data_room_path, use_ai_features, process_button)
149
- """
150
- with st.sidebar:
151
- # Project and data room selection
152
- selected_project_path, selected_data_room_path = render_project_selector()
153
-
154
- # Process button
155
- process_button = st.button(
156
- "🚀 Process Data Room",
157
- type="primary",
158
- use_container_width=True
159
- )
160
-
161
- if process_button:
162
- show_success("Processing... Check main area for progress")
163
-
164
- st.divider()
165
-
166
- # Strategy, Checklist, and Questions selectors
167
- st.subheader("📋 Analysis Configuration")
168
-
169
- # Strategy selector
170
- strategy_path, strategy_text = render_file_selector(
171
- self.config.paths.strategy_dir, "Strategy", "sidebar", "🎯"
172
- )
173
- # Store in session state
174
- st.session_state.selected_strategy_path = strategy_path
175
- st.session_state.selected_strategy_text = strategy_text
176
-
177
- # Checklist selector
178
- checklist_path, checklist_text = render_file_selector(
179
- self.config.paths.checklist_dir, "Checklist", "sidebar", "📊"
180
- )
181
- # Store in session state
182
- st.session_state.selected_checklist_path = checklist_path
183
- st.session_state.selected_checklist_text = checklist_text
184
-
185
- # Questions selector
186
- questions_path, questions_text = render_file_selector(
187
- self.config.paths.questions_dir, "Questions", "sidebar", "❓"
188
- )
189
- # Store in session state
190
- st.session_state.selected_questions_path = questions_path
191
- st.session_state.selected_questions_text = questions_text
192
-
193
- st.divider()
194
-
195
- # AI settings
196
- use_ai_features, api_key, model_choice = render_ai_settings()
197
-
198
- # Initialize AI agent if enabled
199
- if use_ai_features and api_key:
200
- if not hasattr(st.session_state, 'agent') or st.session_state.agent is None:
201
- self.setup_ai_agent(api_key, model_choice)
202
- elif hasattr(st.session_state, 'agent') and st.session_state.agent:
203
- self.agent = st.session_state.agent
204
- else:
205
- st.session_state.agent = None
206
- self.agent = None
207
-
208
- return selected_data_room_path, use_ai_features, process_button
209
-
210
- def render_company_overview_tab(self):
211
- """Render company overview tab"""
212
- # Use strategy from sidebar
213
- strategy_text = st.session_state.get('selected_strategy_text', "")
214
-
215
- # Check if we have documents to display summaries
216
- if st.session_state.documents:
217
- self._render_report_section("overview", strategy_text=strategy_text)
218
- else:
219
- show_info("👈 Configure and process data room to see analysis")
220
-
221
- def render_strategic_analysis_tab(self):
222
- """Render strategic analysis tab"""
223
- # Use strategy from sidebar
224
- strategy_text = st.session_state.get('selected_strategy_text', "")
225
-
226
- # Check if we have documents to display summaries
227
- if st.session_state.documents:
228
- self._render_report_section("strategic", strategy_text=strategy_text)
229
- else:
230
- show_info("👈 Configure and process data room to see analysis")
231
-
232
- def _render_report_section(self, report_type: str, strategy_text: str = ""):
233
- """Unified report rendering for both overview and strategic analysis"""
234
- from src.services import generate_reports
235
-
236
- summary_key = f"{report_type}_summary"
237
-
238
- # Check prerequisites for strategic analysis
239
- if report_type == "strategic" and not st.session_state.checklist_results:
240
- st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
241
- return
242
-
243
- # Auto-generate report if not already present and AI is available
244
- if (not st.session_state.get(summary_key, "") and st.session_state.agent):
245
- with st.spinner(f"🤖 Generating {report_type} analysis..."):
246
- data_room_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
247
- if st.session_state.documents else "Unknown")
248
-
249
- st.session_state[summary_key] = generate_reports(
250
- st.session_state.documents,
251
- data_room_name,
252
- strategy_text,
253
- st.session_state.checklist_results,
254
- report_type,
255
- st.session_state.agent.llm if st.session_state.agent else None
256
- )
257
-
258
- # Display the report if available
259
- if st.session_state.get(summary_key, ""):
260
- st.markdown(st.session_state[summary_key])
261
-
262
- # Add export and regenerate buttons
263
- self._render_report_actions(report_type, summary_key)
264
-
265
- def _render_report_actions(self, report_type: str, summary_key: str):
266
- """Render export and regenerate actions for reports"""
267
- if report_type == "overview":
268
- col1, col2 = st.columns([1, 5])
269
- with col1:
270
- company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
271
- if st.session_state.documents else 'export')
272
- file_name = f"company_overview_{company_name}.md"
273
- st.download_button(
274
- "📥 Export Summary",
275
- data=f"# Company Overview\n\n{st.session_state[summary_key]}",
276
- file_name=file_name,
277
- mime="text/markdown",
278
- key=f"export_{summary_key}"
279
- )
280
- with col2:
281
- if st.button(f"🔄 Regenerate {report_type.title()}"):
282
- st.session_state[summary_key] = ""
283
- st.rerun()
284
- else:
285
- col1, col2 = st.columns([1, 5])
286
- with col1:
287
- # Combined report export for strategic analysis
288
- combined_report = f"# Due Diligence Report\n\n"
289
- combined_report += f"## Company Overview\n\n{st.session_state.get('overview_summary', '')}\n\n"
290
- combined_report += f"## Strategic Analysis\n\n{st.session_state[summary_key]}"
291
-
292
- company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
293
- if st.session_state.documents else 'export')
294
- file_name = f"dd_report_{company_name}.md"
295
- st.download_button(
296
- "📥 Export Report",
297
- data=combined_report,
298
- file_name=file_name,
299
- mime="text/markdown",
300
- key=f"export_combined_{summary_key}"
301
- )
302
- with col2:
303
- if st.button(f"🔄 Regenerate {report_type.title()}"):
304
- st.session_state[summary_key] = ""
305
- st.rerun()
306
-
307
- def render_analysis_tab(self, tab_type: str):
308
- """Unified rendering for checklist and questions tabs"""
309
- if tab_type == "checklist":
310
- # Use checklist from sidebar
311
- file_text = st.session_state.get('selected_checklist_text', "")
312
-
313
- if not file_text:
314
- show_info("👈 Select a checklist in the sidebar to see analysis results")
315
- return
316
-
317
- # Render results if available
318
- render_checklist_results(st.session_state.checklist_results)
319
-
320
- elif tab_type == "questions":
321
- # Use questions from sidebar
322
- file_text = st.session_state.get('selected_questions_text', "")
323
-
324
- if not file_text:
325
- show_info("👈 Select a questions list in the sidebar to see analysis results")
326
- return
327
-
328
- # Render results if available
329
- render_question_results(st.session_state.question_answers)
330
-
331
- def render_qa_tab(self):
332
- """Render the Q&A with citations tab"""
333
- if not st.session_state.chunks:
334
- show_info("👈 Process data room first to enable Q&A")
335
- return
336
-
337
- # Question input
338
- question = st.text_input(
339
- "Ask a question about your documents:",
340
- placeholder="e.g., What are the main risks? What is the revenue model? Who are the key customers?"
341
- )
342
-
343
- # Quick question buttons
344
- quick_question = render_quick_questions()
345
- if quick_question:
346
- question = quick_question
347
-
348
- st.divider()
349
-
350
- if question:
351
- self._handle_qa_query(question)
352
-
353
- def _handle_qa_query(self, question: str):
354
- """Handle Q&A query and display results"""
355
- if not self.document_processor:
356
- self.initialize_services()
357
-
358
- # Use lower threshold for Q&A to get more relevant results
359
- qa_threshold = 0.25
360
-
361
- with st.spinner("🔍 Searching documents..."):
362
- results = search_documents(
363
- self.document_processor,
364
- question,
365
- top_k=self.config.ui.top_k_search_results,
366
- threshold=qa_threshold
367
- )
368
-
369
- if results:
370
- # Use agent to synthesize answer if available
371
- if (hasattr(st.session_state, 'agent') and st.session_state.agent and
372
- hasattr(st.session_state.agent, 'llm')):
373
-
374
- st.markdown("### 🤖 AI Agent's Answer")
375
- with st.spinner("Agent analyzing documents..."):
376
- # Convert results to document format for context
377
- context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
378
- # Use LLM directly for more reliable answers
379
- from langchain_core.messages import HumanMessage
380
- prompt = (f"Question: {question}\n\n"
381
- f"Relevant document excerpts:\n{context}\n\n"
382
- f"Provide a comprehensive answer with citations to the sources.")
383
- response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
384
- # Clean up any leading whitespace and escape math characters
385
- answer_text = escape_markdown_math(response.content.strip())
386
- st.markdown(answer_text)
387
- st.divider()
388
-
389
- st.markdown("### 📚 Source Documents")
390
-
391
- # Display source documents with download buttons
392
- for i, result in enumerate(results[:3], 1):
393
- with st.container():
394
- col1, col2 = st.columns([5, 1])
395
- with col1:
396
- excerpt = result['text'][:200] + "..." if len(result['text']) > 200 else result['text']
397
- st.markdown(f"{i}. \"{excerpt}\"")
398
-
399
- # Create clickable link for the document
400
- doc_path = result.get('path', result.get('full_path', ''))
401
- doc_name = result['source']
402
- doc_title = format_document_title(doc_name)
403
-
404
- if doc_path:
405
- # Create unique key for this result
406
- unique_key = f"result_{i}_{hash(doc_path) % 10000}"
407
- col_a, col_b = st.columns([3, 1])
408
- with col_a:
409
- create_document_link(doc_path, doc_name, doc_title, unique_key)
410
- with col_b:
411
- st.caption(f"({result['citation']})")
412
- else:
413
- st.caption(f" 📄 {result['source']} ({result['citation']})")
414
-
415
- with col2:
416
- self._render_qa_download_button(result, i, question)
417
- else:
418
- st.warning("No relevant information found for your question.")
419
-
420
- def _render_qa_download_button(self, result: Dict, idx: int, question: str):
421
- """Render download button for Q&A results"""
422
- doc_path = result.get('path', '')
423
- if doc_path:
424
- try:
425
- file_path = Path(doc_path)
426
- if not file_path.is_absolute():
427
- file_path = Path("data") / file_path
428
-
429
- if file_path.exists():
430
- with open(file_path, 'rb') as f:
431
- file_bytes = f.read()
432
-
433
- # Determine MIME type based on file extension
434
- mime_type = get_mime_type(file_path)
435
-
436
- button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
437
-
438
- st.download_button(
439
- label="📥 Download",
440
- data=file_bytes,
441
- file_name=result['source'],
442
- mime=mime_type,
443
- key=button_key,
444
- help=f"Download {result['source']}"
445
- )
446
- except Exception as e:
447
- st.error(f"Download failed: {str(e)}")
448
-
449
- def process_data_room(self, data_room_path: str):
450
- """Simplified data room processing"""
451
- if not Path(data_room_path).exists():
452
- show_error(f"Data room path not found: {data_room_path}")
453
- return
454
-
455
- # Use safe_execute for the entire processing operation
456
- def process_operation():
457
- self.initialize_services()
458
- # Simple processing - load documents
459
- self.document_processor.load_data_room(data_room_path)
460
-
461
- # Store results in session state with simplified structure
462
- # Convert list of LangChain documents to dictionary format expected by UI
463
- documents_dict = {}
464
- for doc in self.document_processor.documents:
465
- file_path = doc.metadata.get('source', doc.metadata.get('path', 'unknown'))
466
- documents_dict[file_path] = {
467
- 'name': doc.metadata.get('name', Path(file_path).name if file_path != 'unknown' else 'unknown'),
468
- 'path': doc.metadata.get('path', ''),
469
- 'content': doc.page_content,
470
- 'metadata': doc.metadata
471
- }
472
-
473
- st.session_state.documents = documents_dict
474
- st.session_state.chunks = self.document_processor.chunks
475
- st.session_state.embeddings = self.document_processor.embeddings
476
-
477
- # Process checklist and questions if available
478
- self._process_checklist_and_questions()
479
-
480
- # Clear any existing analysis to trigger regeneration
481
- st.session_state.company_summary = ""
482
- st.session_state.strategy_analysis = ""
483
- st.session_state.overview_summary = ""
484
- st.session_state.strategic_summary = ""
485
-
486
- show_success("✅ Data room processing complete! View results in the tabs above.")
487
- st.rerun()
488
-
489
- safe_execute(
490
- process_operation,
491
- None,
492
- "Data room processing"
493
- )
494
-
495
- def _process_checklist_and_questions(self):
496
- """Process checklist and questions after documents are loaded"""
497
- from src.services import parse_checklist, parse_questions, create_vector_store, search_and_analyze
498
-
499
- # Use checklist from sidebar selection
500
- checklist_text = st.session_state.get('selected_checklist_text', "")
501
- if checklist_text and self.document_processor.chunks:
502
- try:
503
- # Parse checklist
504
- checklist = parse_checklist(checklist_text)
505
- st.session_state.checklist = checklist
506
-
507
- # Create vector store from chunks for processing
508
- vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
509
-
510
- # Process checklist items
511
- checklist_results = search_and_analyze(
512
- checklist,
513
- vector_store,
514
- self.agent.llm if self.agent else None,
515
- self.config.processing.similarity_threshold,
516
- 'items'
517
- )
518
- st.session_state.checklist_results = checklist_results
519
- logger.info("✅ Checklist processing completed")
520
- except Exception as e:
521
- logger.error(f"Checklist processing failed: {e}")
522
-
523
- # Use questions from sidebar selection
524
- questions_text = st.session_state.get('selected_questions_text', "")
525
- if questions_text and self.document_processor.chunks:
526
- try:
527
- # Parse questions
528
- questions = parse_questions(questions_text)
529
- st.session_state.questions = questions
530
-
531
- # Create vector store from chunks for processing (reuse if already created)
532
- if 'vector_store' not in locals():
533
- vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
534
-
535
- # Process questions
536
- question_answers = search_and_analyze(
537
- questions,
538
- vector_store,
539
- self.agent.llm if self.agent else None,
540
- self.config.processing.relevancy_threshold,
541
- 'questions'
542
- )
543
- st.session_state.question_answers = question_answers
544
- logger.info("✅ Questions processing completed")
545
- except Exception as e:
546
- logger.error(f"Questions processing failed: {e}")
547
-
548
- def run(self):
549
- """Run the main application"""
550
- # Render header
551
- st.title("🤖 AI Due Diligence")
552
- st.markdown("**Intelligent M&A Analysis:** Strategic assessment, automated document review, and AI-powered insights")
553
-
554
- # Render sidebar and get selections
555
- selected_data_room_path, use_ai_features, process_button = self.render_sidebar()
556
-
557
- # Main tabs - Company Overview and Strategic Analysis moved to top level
558
- tab1, tab2, tab3, tab4, tab5 = st.tabs([
559
- "🏢 Company Overview",
560
- "🎯 Strategic Analysis",
561
- "📊 Checklist Matching",
562
- "❓ Due Diligence Questions",
563
- "💬 Q&A with Citations"
564
- ])
565
-
566
- with tab1:
567
- self.render_company_overview_tab()
568
-
569
- with tab2:
570
- self.render_strategic_analysis_tab()
571
-
572
- with tab3:
573
- self.render_analysis_tab("checklist")
574
-
575
- with tab4:
576
- self.render_analysis_tab("questions")
577
-
578
- with tab5:
579
- self.render_qa_tab()
580
-
581
- # Processing complete message is handled in process_data_room function
582
-
583
- # Simplified processing trigger
584
- if process_button and selected_data_room_path:
585
- with st.spinner("🚀 Processing data room..."):
586
- self.process_data_room(selected_data_room_path)
587
-
588
-
589
- def main():
590
- """Main application entry point"""
591
- # Configure LangChain logging to reduce verbosity
592
- configure_langchain_logging(log_level="WARNING")
593
-
594
- app = DDChecklistApp()
595
- app.run()
596
-
597
-
598
- if __name__ == "__main__":
599
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Refactored DD Checklist Application
3
+
4
+ A modular Streamlit application for AI-powered due diligence analysis.
5
+ """
6
+
7
+ __version__ = "2.0.0"
{src → app}/ai/__init__.py RENAMED
@@ -18,27 +18,27 @@ from .prompts import (
18
 
19
  # Direct imports for AI functionality - assuming dependencies are present
20
  from .agent_core import (
21
- DDChecklistAgent,
22
- get_langgraph_agent,
23
- AgentState,
 
 
24
  TaskType
25
  )
26
 
27
  # Export main public API
28
  __all__ = [
29
  # Core agent functionality
30
- 'DDChecklistAgent',
31
  'get_langgraph_agent',
32
-
33
 
34
-
35
- # Agent types and state (now in agent_core)
36
  'AgentState',
37
  'TaskType',
38
-
39
  # Prompt functions
40
  'get_checklist_parsing_prompt',
41
- 'get_document_relevance_prompt',
42
  'get_question_answering_prompt',
43
  'get_findings_summary_prompt',
44
  'get_description_generation_prompt',
 
18
 
19
  # Direct imports for AI functionality - assuming dependencies are present
20
  from .agent_core import (
21
+ Agent,
22
+ get_langgraph_agent
23
+ )
24
+ from .agent_utils import (
25
+ AgentState,
26
  TaskType
27
  )
28
 
29
  # Export main public API
30
  __all__ = [
31
  # Core agent functionality
32
+ 'Agent',
33
  'get_langgraph_agent',
 
34
 
35
+ # Agent types and state
 
36
  'AgentState',
37
  'TaskType',
38
+
39
  # Prompt functions
40
  'get_checklist_parsing_prompt',
41
+ 'get_document_relevance_prompt',
42
  'get_question_answering_prompt',
43
  'get_findings_summary_prompt',
44
  'get_description_generation_prompt',
app/ai/agent_core.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LangGraph Agent Core Module
4
+
5
+ This module contains the main LangGraph agent setup and the high-level
6
+ Agent class for interacting with the agent system.
7
+ """
8
+
9
+ # Standard library imports
10
+ import logging
11
+ from typing import Optional, Dict, List, Any, Tuple
12
+
13
+ # Third-party imports
14
+ import streamlit as st
15
+ from langchain_anthropic import ChatAnthropic
16
+ from langchain_core.messages import HumanMessage, AIMessage
17
+ from langgraph.checkpoint.memory import MemorySaver
18
+ from langgraph.graph import StateGraph, END
19
+
20
+ # Local imports
21
+ from app.ai.agent_utils import AgentState
22
+ from app.ai.processing_pipeline import route_task, route_condition
23
+ from app.ai.processing_pipeline import (
24
+ parse_checklist_node,
25
+ match_checklist_node,
26
+ answer_question_node,
27
+ summarize_node
28
+ )
29
+ from app.core.config import get_config
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+
35
+ # Agent Functions
36
+
37
def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, "ChatAnthropic"]]:
    """
    Build the LangGraph workflow backed by an Anthropic chat model.

    Args:
        api_key: Anthropic API key; falls back to config, then Streamlit secrets.
        model: Model name; falls back to the configured default when omitted.

    Returns:
        Tuple of (compiled_app, llm), or None when no API key can be resolved.
    """
    cfg = get_config()

    # Resolve the API key: explicit argument -> config -> Streamlit secrets.
    key = api_key or cfg.api.anthropic_api_key
    if not key and st and hasattr(st, 'secrets') and 'ANTHROPIC_API_KEY' in st.secrets:
        key = st.secrets['ANTHROPIC_API_KEY']
    if not key:
        return None

    # Initialize Claude with configured sampling and token limits.
    llm = ChatAnthropic(
        model=model or cfg.model.claude_model,
        anthropic_api_key=key,
        temperature=cfg.model.temperature,
        max_tokens=cfg.model.max_tokens
    )

    # No custom tools needed - built-in LangGraph functionality suffices.
    graph = StateGraph(AgentState)

    # Task nodes close over the llm; the router needs no model access.
    graph.add_node("route", route_task)
    graph.add_node("parse_checklist", lambda state: parse_checklist_node(state, llm))
    graph.add_node("match_checklist", lambda state: match_checklist_node(state, llm))
    graph.add_node("answer_question", lambda state: answer_question_node(state, llm))
    graph.add_node("summarize", lambda state: summarize_node(state, llm))

    graph.set_entry_point("route")

    # Conditional routing based on the next_action decided by the router.
    task_names = ("parse_checklist", "match_checklist", "answer_question", "summarize")
    graph.add_conditional_edges(
        "route",
        route_condition,
        {name: name for name in task_names}
    )

    # Every task node terminates the run.
    for name in task_names:
        graph.add_edge(name, END)

    # Compile with in-memory checkpointing so conversations persist per thread.
    compiled = graph.compile(checkpointer=MemorySaver())

    return compiled, llm
127
+
128
+
129
class Agent:
    """High-level interface for the LangGraph due diligence agent.

    Wraps the compiled LangGraph workflow with convenience methods for the
    supported tasks (parse, match, answer, summarize). Every method degrades
    gracefully when the underlying agent could not be initialized.
    """

    def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the Agent.

        Args:
            api_key: Anthropic API key (optional; resolved from config/secrets)
            model: Model name to use (optional; config default when omitted)
        """
        result = get_langgraph_agent(api_key, model)
        if result:
            self.app, self.llm = result
            # Single conversation thread reused across all invocations.
            self.thread_id = "dd-poc-session"
        else:
            self.app = None
            self.llm = None

    def is_available(self) -> bool:
        """Check if the agent is available for use"""
        return self.app is not None and self.llm is not None

    def _invoke(self, initial_state: Dict) -> Dict:
        """Run the compiled graph on *initial_state* using this session's thread."""
        return self.app.invoke(
            initial_state,
            config={"configurable": {"thread_id": self.thread_id}}
        )

    @staticmethod
    def _last_ai_content(result: Dict) -> Optional[str]:
        """Return the content of the most recent AIMessage in *result*, or None."""
        for msg in reversed(result.get("messages", [])):
            if isinstance(msg, AIMessage):
                return msg.content
        return None

    def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
        """
        Parse checklist using the agent.

        Args:
            checklist_text: Raw checklist text to parse

        Returns:
            Parsed checklist dictionary or None if failed
        """
        if not self.app:
            return None

        try:
            result = self._invoke(
                {"messages": [HumanMessage(content=f"Parse this checklist: {checklist_text}")]}
            )
            return result.get("checklist")
        except Exception as e:
            # Surface the failure in the UI but keep the app running.
            st.error(f"Agent error: {str(e)}")
            return None

    def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
        """
        Match documents to checklist items.

        Args:
            checklist: Parsed checklist dictionary
            documents: List of document dictionaries

        Returns:
            Dictionary of findings or empty dict if failed
        """
        if not self.app:
            return {}

        try:
            result = self._invoke({
                "messages": [HumanMessage(content="Match documents to checklist items")],
                "checklist": checklist,
                "documents": documents,
                "findings": {}
            })
            return result.get("findings", {})
        except Exception as e:
            st.error(f"Agent error: {str(e)}")
            return {}

    def answer_question(self, question: str, documents: List[Dict]) -> str:
        """
        Answer a question using document context.

        Args:
            question: User question
            documents: List of document dictionaries for context

        Returns:
            Answer string or error message
        """
        if not self.app:
            return "Agent not available"

        try:
            result = self._invoke({
                "messages": [HumanMessage(content=question)],
                "documents": documents
            })
            content = self._last_ai_content(result)
            # Explicit None check so a legitimately empty answer is returned as-is.
            return content if content is not None else "No answer generated"
        except Exception as e:
            return f"Error: {str(e)}"

    def summarize_findings(self, findings: Dict) -> str:
        """
        Generate executive summary.

        Args:
            findings: Dictionary of due diligence findings

        Returns:
            Summary string or error message
        """
        if not self.app:
            return "Agent not available"

        try:
            result = self._invoke({
                "messages": [HumanMessage(content="Summarize the due diligence findings")],
                "findings": findings
            })
            content = self._last_ai_content(result)
            return content if content is not None else "No summary generated"
        except Exception as e:
            return f"Error: {str(e)}"
app/ai/agent_utils.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Agent Utilities Module
4
+
5
+ This module contains utility functions, helper methods, and type definitions
6
+ for the LangGraph agent system.
7
+ """
8
+
9
+ # Standard library imports
10
+ import logging
11
+ import random
12
+ import time
13
+ from enum import Enum
14
+ from typing import Optional, Dict, List, Sequence
15
+
16
+ # Third-party imports
17
+ from langchain_core.runnables import RunnableLambda
18
+ from typing_extensions import TypedDict
19
+
20
+ # Local imports
21
+ from app.core.config import get_config
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def with_retry(func, max_attempts=3, base_delay=1.0):
    """
    Wrap *func* with exponential-backoff retry logic.

    Args:
        func: Function to wrap with retry logic
        max_attempts: Maximum number of attempts before giving up (default: 3)
        base_delay: Base delay in seconds for exponential backoff (default: 1.0)

    Returns:
        Wrapped function with retry logic; the wrapped function's metadata
        (__name__, __doc__, ...) is preserved via functools.wraps.

    Raises:
        Exception: Re-raises the last exception once all attempts fail.
    """
    import functools  # local import keeps the module's import surface unchanged

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for attempt in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                if attempt == max_attempts - 1:  # Last attempt
                    # Bare raise preserves the original traceback.
                    raise

                # Exponential backoff with jitter to spread concurrent retries.
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                logger.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {delay:.2f}s...")
                time.sleep(delay)

    return wrapper
52
+
53
+
54
def create_batch_processor(llm: "ChatAnthropic", max_concurrency: Optional[int] = None) -> RunnableLambda:
    """
    Create a batch processor using LangChain's retry and fallback mechanisms.

    Args:
        llm: ChatAnthropic instance
        max_concurrency: Maximum concurrent requests (defaults to 3 when None)

    Returns:
        RunnableLambda that maps a list of (messages, item_info) pairs to a
        list of result dicts ({'success', 'response', 'item_info', 'error'});
        any missing response or batch failure raises after retry attempts.
    """
    if max_concurrency is None:
        max_concurrency = 3  # Default max concurrency

    def process_batch(batch_inputs):
        """Run one batch through llm.batch, failing fast on any bad item."""
        try:
            # Use LLM's batch method for efficiency
            messages_batch = [pair[0] for pair in batch_inputs]
            item_infos = [pair[1] for pair in batch_inputs]

            responses = llm.batch(
                messages_batch,
                config={"max_concurrency": max_concurrency}
            )

            results = []
            for i, (response, item_info) in enumerate(zip(responses, item_infos)):
                if not response:
                    # A missing response invalidates the whole batch so the
                    # retry wrapper can re-run it from scratch.
                    error_msg = f'No response for item {i}'
                    logger.error(error_msg)
                    raise Exception(error_msg)
                results.append({
                    'success': True,
                    'response': response,
                    'item_info': item_info,
                    'error': None
                })
            return results

        except Exception as e:
            error_msg = f"Batch processing failed: {e}"
            logger.error(error_msg)
            # Chain the cause so the original failure stays visible in logs.
            raise Exception(error_msg) from e

    # Wrap with exponential-backoff retries before handing to LangChain.
    retryable_process_batch = with_retry(process_batch, max_attempts=3, base_delay=1.0)
    return RunnableLambda(retryable_process_batch)
127
+
128
+
129
+ # =============================================================================
130
+ # TYPE DEFINITIONS
131
+ # =============================================================================
132
+
133
+ # Define the state for our agent
134
# Define the state for our agent
class AgentState(TypedDict):
    """State for the due diligence agent"""
    # Conversation history exchanged with the LLM (Human/AI messages).
    messages: Sequence["BaseMessage"]
    # Parsed checklist (categories keyed by letter), once one has been parsed.
    checklist: Optional[Dict]
    # Ingested documents available as context for the task nodes.
    documents: Optional[List[Dict]]
    # Name of the task currently being executed, if any.
    current_task: Optional[str]
    # Accumulated findings keyed by category name.
    # NOTE(review): match_checklist_node stores lists of dicts here, not
    # lists of strings - confirm whether this annotation should be widened.
    findings: Dict[str, List[str]]
    # Routing decision consumed by the graph's conditional edges.
    next_action: Optional[str]
142
+
143
+
144
class TaskType(Enum):
    """Types of tasks the agent can perform"""
    # Convert raw checklist text into structured categories/items.
    PARSE_CHECKLIST = "parse_checklist"
    # Analyze a single document's content.
    ANALYZE_DOCUMENT = "analyze_document"
    # Match available documents against checklist items.
    MATCH_CHECKLIST = "match_checklist"
    # Answer a free-form user question from document context.
    ANSWER_QUESTION = "answer_question"
    # Produce an executive summary of accumulated findings.
    SUMMARIZE_FINDINGS = "summarize_findings"
app/ai/document_classifier.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Document Classification Module
4
+
5
+ This module contains functions for classifying document types and related utilities.
6
+ """
7
+
8
+ # Standard library imports
9
+ import logging
10
+ from typing import List, Dict, Optional
11
+
12
+ # Third-party imports
13
+ from langchain_core.messages import HumanMessage
14
+ import httpx
15
+ import backoff
16
+
17
+ # Local imports
18
+ from app.ai.agent_utils import create_batch_processor
19
+ from app.ai.prompts import get_document_type_classification_prompt
20
+ from app.core.config import get_config
21
+ from app.core.constants import DEFAULT_BATCH_SIZE
22
+ from app.core.performance import get_performance_manager
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
@backoff.on_exception(
    backoff.expo,
    (Exception,),
    max_tries=3,
    jitter=backoff.random_jitter
)
def batch_classify_document_types(first_chunks: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
    """
    Fast document type classification using first chunks only with Haiku model.
    Optimized for speed and cost with batched processing; results are cached
    per document path for 30 days.

    Args:
        first_chunks: List of first chunk dictionaries to classify
        llm: ChatAnthropic instance (should be Haiku for speed/cost)
        batch_size: Number of documents per batch (capped at 25 when None)

    Returns:
        List of documents with added document_type field

    Raises:
        Exception: If any single classification or a whole batch fails; the
            backoff decorator retries the full call up to 3 times.
    """
    if batch_size is None:
        # Haiku is fast enough that larger batches pay off.
        batch_size = min(DEFAULT_BATCH_SIZE, 25)

    # Batch processor with retry and fail-fast per-item handling.
    batch_processor = create_batch_processor(llm, max_concurrency=5)

    classified_docs = []
    total_docs = len(first_chunks)
    total_batches = (total_docs + batch_size - 1) // batch_size

    model_name = getattr(llm, 'model', 'unknown')
    logger.info(f"🏷️ Classifying {total_docs} document types using {model_name}")

    # Performance manager provides the persistent classification cache.
    perf_manager = get_performance_manager()

    # Older model responses may start with this prefix; strip it for
    # backward compatibility.
    legacy_prefix = "the document type is "

    for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
        batch = first_chunks[i:i + batch_size]

        # Split the batch into cache hits and documents needing a model call.
        cached_batch = []
        uncached_batch = []
        for doc in batch:
            cache_key = f"classification:{doc.get('path', '')}"
            cached_result = perf_manager.doc_cache.get(cache_key)
            if cached_result:
                cached_batch.append(cached_result)
                logger.debug(f"Cache hit for document classification: {doc.get('name', '')}")
            else:
                uncached_batch.append(doc)

        logger.info(f"Processing classification batch {batch_num}/{total_batches} "
                    f"({len(uncached_batch)} new, {len(cached_batch)} cached documents)")

        # Only process uncached documents
        if uncached_batch:
            # One prompt per document: name plus a 500-char content preview.
            batch_inputs = []
            for doc in uncached_batch:
                template = get_document_type_classification_prompt()
                prompt = template.format(
                    doc_name=doc.get('name', 'Unknown'),
                    content_preview=doc.get('content', '')[:500]
                )
                batch_inputs.append(([HumanMessage(content=prompt)], doc))

            try:
                batch_results = batch_processor.invoke(batch_inputs)

                # Process results with individual document error handling.
                for result in batch_results:
                    doc = result['item_info'].copy()

                    if result['success'] and result['response']:
                        doc_type = result['response'].content.strip().lower()
                        if doc_type.startswith(legacy_prefix):
                            doc_type = doc_type[len(legacy_prefix):].strip()
                        doc['document_type'] = doc_type
                        logger.debug(f"Classified '{doc.get('name', 'Unknown')}' as: {doc_type}")

                        # Cache the enriched document for 30 days.
                        cache_key = f"classification:{doc.get('path', '')}"
                        perf_manager.doc_cache.set(cache_key, doc, expire=86400 * 30)

                        classified_docs.append(doc)
                    else:
                        # Fail on classification error.
                        error_msg = f"Failed to classify document '{doc.get('name', 'Unknown')}': {result.get('error', 'Unknown error')}"
                        logger.error(error_msg)
                        raise Exception(error_msg)

            except Exception as e:
                error_msg = f"Classification batch {batch_num} processing completely failed: {e}"
                logger.error(error_msg)
                raise Exception(error_msg) from e

        # Cache hits join the output after the freshly classified documents.
        classified_docs.extend(cached_batch)

    successful_classifications = len([d for d in classified_docs if d.get('document_type') != 'unknown document'])
    success_rate = (successful_classifications / total_docs) * 100 if total_docs > 0 else 0
    logger.info(f"✅ Classified {successful_classifications}/{total_docs} documents ({success_rate:.1f}% success rate)")

    return classified_docs
app/ai/processing_pipeline.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Processing Pipeline Module
4
+
5
+ This module contains content processing pipeline and workflow functions,
6
+ including agent node functions and batch processing utilities.
7
+ """
8
+
9
+ # Standard library imports
10
+ import logging
11
+ from typing import List, Dict, Optional
12
+
13
+ # Third-party imports
14
+ import streamlit as st
15
+ from langchain_core.messages import HumanMessage, AIMessage
16
+ from langchain_core.output_parsers import PydanticOutputParser
17
+ from pydantic import BaseModel, Field
18
+
19
+ # Local imports
20
+ from app.ai.agent_utils import AgentState, create_batch_processor
21
+ from app.ai.prompts import (
22
+ get_checklist_parsing_prompt,
23
+ get_document_relevance_prompt,
24
+ get_question_answering_prompt,
25
+ get_findings_summary_prompt,
26
+ get_description_generation_prompt,
27
+ get_document_summarization_prompt
28
+ )
29
+ from app.core.config import get_config
30
+ from app.core.constants import DEFAULT_BATCH_SIZE
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # Pydantic models for structured output parsing
36
class ChecklistItem(BaseModel):
    """Individual checklist item"""
    # Cleaned item text used throughout the pipeline.
    text: str = Field(description="The checklist item text")
    # Verbatim source text, kept for traceability back to the input.
    original: str = Field(description="The original text before any cleanup")
40
+
41
class ChecklistCategory(BaseModel):
    """Checklist category with items"""
    # Human-readable category title taken from the source checklist.
    name: str = Field(description="Category name (e.g., 'Organizational and Corporate Documents')")
    items: List[ChecklistItem] = Field(description="List of checklist items in this category")
45
+
46
class StructuredChecklist(BaseModel):
    """Complete checklist with all categories"""
    # Keyed by category letter ("A", "B", ...) as it appears in the source text.
    categories: Dict[str, ChecklistCategory] = Field(
        description="Dictionary of categories keyed by letter (A, B, C, etc.)"
    )
51
+
52
class Question(BaseModel):
    """Individual question"""
    # Category this question belongs to.
    category: str = Field(description="Question category")
    question: str = Field(description="The question text")
    # Stable identifier used to reference the question elsewhere.
    id: str = Field(description="Unique question ID")
57
+
58
class StructuredQuestions(BaseModel):
    """List of structured questions"""
    questions: List[Question] = Field(description="List of all questions")
61
+
62
+
63
+
64
def route_task(state: AgentState) -> AgentState:
    """Decide which task node should handle the latest message.

    Inspects the most recent message's content and writes the routing
    decision into state["next_action"]. The state is mutated in place and
    returned; an empty message history leaves it untouched.
    """
    history = state["messages"]
    if not history:
        # Nothing to route on.
        return state

    text = history[-1].content
    lowered = text.lower()

    # Keyword precedence: parse -> match -> question -> summary fallback.
    if "parse" in lowered and "checklist" in lowered:
        action = "parse_checklist"
    elif "analyze" in lowered or "match" in lowered:
        action = "match_checklist"
    elif "?" in text:
        action = "answer_question"
    else:
        action = "summarize"

    state["next_action"] = action
    return state
83
+
84
+
85
def parse_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Parse raw checklist text into structured categories (StructuredChecklist).

    On success stores the parsed categories in state["checklist"] and appends
    a status message; on failure only appends a failure message. Returns the
    (mutated) state either way.
    """
    history = state["messages"]
    raw_text = history[-1].content if history else ""

    # Structured parsing - same schema as the standalone parse_checklist path.
    output_parser = PydanticOutputParser(pydantic_object=StructuredChecklist)
    template = get_checklist_parsing_prompt()

    try:
        # Send the full checklist untruncated; the LLM handles the whole document.
        request = template.format_messages(
            checklist_text=raw_text,
            format_instructions=output_parser.get_format_instructions()
        )

        reply = llm.invoke(request)
        parsed = output_parser.parse(reply.content)

        # Flatten the Pydantic model into the plain-dict shape callers expect.
        categories = {
            key: {
                'name': cat.name,
                'items': [
                    {'text': entry.text, 'original': entry.original}
                    for entry in cat.items
                ]
            }
            for key, cat in parsed.categories.items()
        }

        state["checklist"] = categories
        state["messages"].append(AIMessage(content=f"Parsed {len(categories)} categories"))

    except Exception as exc:
        state["messages"].append(AIMessage(content=f"Parsing failed: {str(exc)}"))

    return state
128
+
129
+
130
def match_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Match available documents against each checklist item.

    Requires both a parsed checklist and documents in the state; otherwise a
    status message is appended and the state is returned unchanged.
    """
    checklist = state.get("checklist", {})
    documents = state.get("documents", [])

    if not checklist or not documents:
        state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
        return state

    # Only the first 10 document names are offered to the model per item;
    # the list is loop-invariant, so build it once.
    candidate_names = [d.get('name', 'Unknown') for d in documents[:10]]

    findings = {}
    for _letter, category in checklist.items():
        per_item = []
        for entry in category.get("items", []):
            # Ask Claude which of the candidate documents are relevant.
            relevance_prompt = get_document_relevance_prompt(entry['text'], candidate_names)
            reply = llm.invoke([HumanMessage(content=str(relevance_prompt))])
            per_item.append({
                "item": entry['text'],
                "relevant_docs": reply.content
            })
        findings[category['name']] = per_item

    state["findings"] = findings
    state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))

    return state
160
+
161
+
162
def answer_question_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Answer the latest user question using up to five documents as context."""
    history = state["messages"]
    question = history[-1].content if history else ""
    documents = state.get("documents", [])

    # Compact context: document name plus the first 200 chars of its text.
    snippets = []
    for doc in documents[:5]:
        snippets.append(f"- {doc.get('name', 'Unknown')}: {doc.get('text', '')[:200]}")
    context = "\n".join(snippets)

    qa_prompt = get_question_answering_prompt(question, context)
    reply = llm.invoke([HumanMessage(content=qa_prompt)])
    state["messages"].append(AIMessage(content=reply.content))

    return state
177
+
178
+
179
def summarize_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Produce a summary of accumulated findings, if any exist."""
    findings = state.get("findings", {})

    if not findings:
        # Nothing accumulated yet - report instead of calling the model.
        state["messages"].append(AIMessage(content="No findings to summarize"))
        return state

    summary_prompt = get_findings_summary_prompt(findings)
    reply = llm.invoke([HumanMessage(content=summary_prompt)])
    state["messages"].append(AIMessage(content=reply.content))

    return state
192
+
193
+
194
def route_condition(state: AgentState) -> str:
    """Map the routing decision in state["next_action"] onto a graph edge name.

    Any missing or unrecognized value falls through to "summarize".
    """
    choice = state.get("next_action")
    if choice in ("parse_checklist", "match_checklist", "answer_question"):
        return choice
    return "summarize"
205
+
206
+
207
+
208
+
209
def batch_summarize_documents(documents: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
    """
    Summarize documents using LangChain's built-in retry mechanisms and proper error handling.
    Uses RunnableLambda for better batch processing control with individual item error handling.
    Returns documents with added 'summary' field.

    Args:
        documents: List of document dictionaries to summarize
        llm: ChatAnthropic instance for generating summaries
        batch_size: Number of documents to process in each batch (uses config default if None)

    Returns:
        List of documents with added summary field

    Raises:
        Exception: If any individual summary or a whole batch fails.
    """

    # NOTE(review): config is fetched but never read below - confirm whether
    # batch sizing was meant to come from it instead of DEFAULT_BATCH_SIZE.
    config = get_config()
    if batch_size is None:
        batch_size = DEFAULT_BATCH_SIZE

    # Create batch processor with retry and fallback mechanisms
    batch_processor = create_batch_processor(llm, max_concurrency=3)

    # Process documents in batches
    summarized_docs = []
    total_docs = len(documents)
    total_batches = (total_docs + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
        batch = documents[i:i + batch_size]
        batch_end = min(i + batch_size, total_docs)

        # Update the Streamlit progress bar only when the UI registered one
        # in session state (keeps this callable from non-UI contexts).
        if hasattr(st, 'progress') and 'summary_progress' in st.session_state:
            progress = i / total_docs
            st.session_state.summary_progress.progress(
                progress,
                text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
            )

        # Prepare batch inputs for the processor: (messages, original_doc) pairs
        # so the original document can be recovered from each result.
        batch_inputs = []
        for doc in batch:
            template = get_document_summarization_prompt(doc)
            prompt = template.format()
            messages = [HumanMessage(content=prompt)]
            batch_inputs.append((messages, doc))

        # Process batch using LangChain's built-in mechanisms
        try:
            batch_results = batch_processor.invoke(batch_inputs)

            # Process results with individual document error handling
            for result in batch_results:
                # Copy so the caller's input documents are never mutated.
                doc = result['item_info'].copy()

                if result['success'] and result['response']:
                    # Successfully generated summary
                    doc['summary'] = result['response'].content.strip()
                    summarized_docs.append(doc)
                else:
                    # Fail on summary generation error
                    error_msg = f"Failed to generate summary for document '{doc.get('name', 'Unknown')}': {result.get('error', 'Unknown error')}"
                    logger.error(error_msg)
                    raise Exception(error_msg)

        except Exception as e:
            error_msg = f"Batch {batch_num} processing completely failed: {e}"
            logger.error(error_msg)
            raise Exception(error_msg)

    return summarized_docs
{src → app}/ai/prompts.py RENAMED
@@ -6,46 +6,75 @@ This module contains all prompt templates used for AI interactions
6
  in the DD-Checklist application.
7
  """
8
 
 
9
  import json
10
  from typing import Dict, List
11
- from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
 
12
  from langchain_core.messages import SystemMessage, HumanMessage
 
 
 
 
13
 
14
 
15
- def get_checklist_parsing_prompt(checklist_text: str) -> ChatPromptTemplate:
16
- """Generate prompt for parsing due diligence checklists with structured output"""
17
  return ChatPromptTemplate.from_messages([
18
  SystemMessage(content="""
19
- Parse this due diligence checklist into structured format. Extract:
20
- - Categories (A., B., C., etc.) with their names
21
- - Numbered items within each category (1., 2., 3., etc.)
22
- - Total count of items
23
-
24
- Follow the exact format specified in the format instructions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  """),
26
- HumanMessage(content="""Parse this checklist:
27
 
28
  {checklist_text}
29
 
 
30
  {format_instructions}
31
 
32
- Please provide the structured output:""")
33
  ])
34
 
35
 
36
  def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
37
- """Generate prompt for assessing document relevance to checklist items with structured output"""
38
  return PromptTemplate.from_template(
39
- """Analyze which documents are relevant to the following checklist item:
40
 
41
  Checklist Item: {item_text}
42
 
43
  Available Documents:
44
  {documents}
45
 
46
- {format_instructions}
47
-
48
- Please provide your analysis in the specified format:"""
49
  )
50
 
51
 
@@ -57,7 +86,7 @@ def get_question_answering_prompt(question: str, context: str) -> ChatPromptTemp
57
  ])
58
 
59
 
60
- def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> PromptTemplate:
61
  """Generate prompt for summarizing due diligence findings"""
62
  findings_text = json.dumps(findings, indent=2)[:max_chars]
63
  return PromptTemplate.from_template(
@@ -81,6 +110,24 @@ def get_description_generation_prompt(category_name: str, item_text: str) -> Pro
81
  ).partial(category_name=category_name, item_text=item_text)
82
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
85
  """Generate prompt for document type identification and summarization"""
86
  doc_name = doc.get('name', 'Unknown')
 
6
  in the DD-Checklist application.
7
  """
8
 
9
+ # Standard library imports
10
  import json
11
  from typing import Dict, List
12
+
13
+ # Third-party imports
14
  from langchain_core.messages import SystemMessage, HumanMessage
15
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
16
+
17
+ # Local imports
18
+ from app.core.constants import QA_MAX_TOKENS
19
 
20
 
21
+ def get_checklist_parsing_prompt() -> ChatPromptTemplate:
22
+ """Generate prompt template for parsing due diligence checklists with structured output"""
23
  return ChatPromptTemplate.from_messages([
24
  SystemMessage(content="""
25
+ You are a JSON parser. Your ONLY task is to convert the checklist into valid JSON format.
26
+
27
+ CRITICAL PARSING RULES:
28
+ - Return ONLY valid JSON - no explanations, no notes, no additional text
29
+ - Do NOT add any conversational text before or after the JSON
30
+ - Do NOT offer to continue or ask questions
31
+ - Do NOT provide partial results or examples
32
+ - Parse the COMPLETE document - every single category and item
33
+
34
+ JSON Structure Required:
35
+ - Top-level object with "categories" field
36
+ - Categories keyed by letter (A, B, C, D, E, etc.)
37
+ - Each category has "name" and "items" fields
38
+ - Each item has "text" and "original" fields
39
+
40
+ You must process the ENTIRE checklist. Do not stop after a few categories.
41
+
42
+ Output format:
43
+ {
44
+ "categories": {
45
+ "A": {
46
+ "name": "Category Name",
47
+ "items": [
48
+ {"text": "Item text", "original": "1. Item text"}
49
+ ]
50
+ }
51
+ }
52
+ }
53
+
54
+ Return ONLY the JSON. No other text.
55
  """),
56
+ HumanMessagePromptTemplate.from_template("""Parse this complete checklist into the exact JSON format:
57
 
58
  {checklist_text}
59
 
60
+ Required JSON schema:
61
  {format_instructions}
62
 
63
+ Return the complete JSON with all categories found in the checklist:""")
64
  ])
65
 
66
 
67
  def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
68
+ """Generate prompt for assessing document relevance to checklist items"""
69
  return PromptTemplate.from_template(
70
+ """Analyze which documents are most relevant to the following checklist item.
71
 
72
  Checklist Item: {item_text}
73
 
74
  Available Documents:
75
  {documents}
76
 
77
+ Provide a brief analysis identifying the most relevant documents and explain why they are relevant to this checklist item. Be concise and specific."""
 
 
78
  )
79
 
80
 
 
86
  ])
87
 
88
 
89
+ def get_findings_summary_prompt(findings: Dict, max_chars: int = QA_MAX_TOKENS) -> PromptTemplate:
90
  """Generate prompt for summarizing due diligence findings"""
91
  findings_text = json.dumps(findings, indent=2)[:max_chars]
92
  return PromptTemplate.from_template(
 
110
  ).partial(category_name=category_name, item_text=item_text)
111
 
112
 
113
+ def get_document_type_classification_prompt() -> PromptTemplate:
114
+ """Generate prompt for fast document type classification based on first chunk content"""
115
+ return PromptTemplate.from_template(
116
+ "Classify the document type using one short phrase. Use exact terminology.\n"
117
+ "Respond with ONLY the document type, no prefix or explanation.\n\n"
118
+ "Examples:\n"
119
+ "certificate of incorporation\n"
120
+ "corporate bylaws\n"
121
+ "amended and restated bylaws\n"
122
+ "board resolution\n"
123
+ "financial statement\n"
124
+ "employment agreement\n"
125
+ "software license agreement\n\n"
126
+ "Document: {doc_name}\n"
127
+ "Content: {content_preview}\n\n"
128
+ "Document type:"
129
+ )
130
+
131
  def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
132
  """Generate prompt for document type identification and summarization"""
133
  doc_name = doc.get('name', 'Unknown')
app/core/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core Business Logic Layer
3
+
4
+ This layer contains the core business logic and domain models.
5
+ It should not depend on UI or external frameworks.
6
+ """
7
+
8
+ # Configuration
9
+ from .config import AppConfig, get_config
10
+
11
+ # Exceptions
12
+ from .exceptions import (
13
+ AppException,
14
+ DocumentProcessingError,
15
+ SearchError,
16
+ ConfigError,
17
+ FileOperationError,
18
+ AIError,
19
+ LLMConnectionError,
20
+ LLMAuthenticationError,
21
+ LLMTimeoutError,
22
+ LLMQuotaExceededError,
23
+ LLMInvalidResponseError,
24
+ create_processing_error,
25
+ create_config_error,
26
+ create_ai_error
27
+ )
28
+
29
+ # Core classes and functions
30
+ from .document_processor import DocumentProcessor
31
+ from .search import search_and_analyze, search_documents
32
+ from .ranking import rerank_results
33
+ from .parsers import parse_checklist, parse_questions
34
+ from .utils import create_document_processor, format_document_title, count_documents_in_directory
35
+ from .logging import logger
36
+ from .constants import (
37
+ RELEVANCY_THRESHOLD,
38
+ SIMILARITY_THRESHOLD,
39
+ DEFAULT_BATCH_SIZE,
40
+ QA_MAX_TOKENS,
41
+ CHECKLIST_PARSING_MAX_TOKENS
42
+ )
43
+
44
+ __all__ = [
45
+ # Configuration
46
+ 'AppConfig', 'get_config',
47
+
48
+ # Exceptions
49
+ 'AppException', 'DocumentProcessingError', 'SearchError', 'ConfigError',
50
+ 'FileOperationError', 'AIError', 'LLMConnectionError', 'LLMAuthenticationError',
51
+ 'LLMTimeoutError', 'LLMQuotaExceededError', 'LLMInvalidResponseError',
52
+ 'create_processing_error', 'create_config_error', 'create_ai_error',
53
+
54
+ # Core functionality
55
+ 'DocumentProcessor', 'search_and_analyze', 'search_documents', 'rerank_results',
56
+ 'parse_checklist', 'parse_questions', 'create_document_processor',
57
+ 'format_document_title', 'count_documents_in_directory', 'logger',
58
+
59
+ # Constants
60
+ 'RELEVANCY_THRESHOLD', 'SIMILARITY_THRESHOLD', 'DEFAULT_BATCH_SIZE', 'QA_MAX_TOKENS', 'CHECKLIST_PARSING_MAX_TOKENS'
61
+ ]
app/core/config.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Dict, Any, Optional
from pathlib import Path
import os
from dotenv import load_dotenv
from app.core.constants import (
    CHUNK_SIZE, CHUNK_OVERLAP, SIMILARITY_THRESHOLD,
    RELEVANCY_THRESHOLD, CLASSIFICATION_MAX_TOKENS, CHECKLIST_PARSING_MAX_TOKENS,
    TEMPERATURE
)

load_dotenv()


class AppConfig:
    """Central application configuration.

    Sections (ui, model, processing, paths, anthropic) are built once in
    ``_load_config`` from constants and environment variables, and exposed
    through read-only properties. Call :meth:`validate` to fail fast on
    missing API keys, directories, model files, or out-of-range values.
    """

    def __init__(self) -> None:
        self._config: Dict[str, Any] = {}
        self._load_config()

    def _load_config(self) -> None:
        """Populate all configuration sections from constants and environment."""
        self._config['ui'] = {
            'page_title': "🤖 AI Due Diligence",
            'page_icon': "🤖",
            'layout': "wide",
            'top_k_search_results': 10
        }

        self._config['model'] = {
            'sentence_transformer_model': 'sentence-transformers/all-mpnet-base-v2',
            'claude_model': os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet'),
            'claude_haiku_model': 'claude-3-5-haiku-20241022',
            'classification_max_tokens': CLASSIFICATION_MAX_TOKENS,
            'temperature': float(os.getenv('CLAUDE_TEMPERATURE', str(TEMPERATURE))),
            'max_tokens': int(os.getenv('CLAUDE_MAX_TOKENS', '16000'))  # High limit for checklist parsing
        }

        self._config['processing'] = {
            'chunk_size': CHUNK_SIZE,
            'chunk_overlap': CHUNK_OVERLAP,
            'similarity_threshold': SIMILARITY_THRESHOLD,
            'relevancy_threshold': RELEVANCY_THRESHOLD,
            'supported_file_extensions': [
                '.pdf', '.docx', '.doc', '.txt', '.md',
                '.xls', '.xlsx', '.ppt', '.pptx'
            ],
            'faiss_store_name': 'default'
        }

        self._config['paths'] = {
            'data_dir': Path('data'),
            'strategy_dir': Path('data/strategy'),
            'checklist_dir': Path('data/checklist'),
            'questions_dir': Path('data/questions'),
            'vdrs_dir': Path('data/vdrs'),
            'faiss_dir': Path('data/search_indexes')
        }

        self._config['anthropic'] = {
            'api_key': os.getenv('ANTHROPIC_API_KEY'),
            'model': os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet')
        }

    @property
    def ui(self) -> Dict[str, Any]:
        """UI settings (page title/icon/layout, search result count)."""
        return self._config['ui']

    @property
    def model(self) -> Dict[str, Any]:
        """Model settings (embedding model, Claude models, token limits)."""
        return self._config['model']

    @property
    def processing(self) -> Dict[str, Any]:
        """Document processing settings (chunking, thresholds, extensions)."""
        return self._config['processing']

    @property
    def paths(self) -> Dict[str, Path]:
        """Filesystem layout for data, checklists, questions, and indexes."""
        return self._config['paths']

    @property
    def anthropic(self) -> Dict[str, Optional[str]]:
        """Anthropic API credentials and model selection."""
        return self._config['anthropic']

    def validate(self) -> bool:
        """Validate all critical configuration values.

        Returns:
            True if every check passes.

        Raises:
            ValueError: on any invalid or missing configuration value.
        """
        self._validate_anthropic_config()
        self._validate_paths()
        self._validate_models()
        self._validate_processing_config()
        self._validate_file_extensions()
        return True

    def _validate_anthropic_config(self) -> None:
        """Validate Anthropic API configuration."""
        if not self.anthropic.get('api_key'):
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")

        model = self.anthropic.get('model')
        if not model:
            raise ValueError("CLAUDE_MODEL environment variable is required")

        valid_claude_models = [
            'claude-3-5-sonnet',
            'claude-3-5-haiku-20241022',
            'claude-3-opus-20240229',
            'claude-3-sonnet-20240229',
            'claude-3-haiku-20240307'
        ]
        if model not in valid_claude_models:
            raise ValueError(f"Invalid Claude model: {model}. Valid models: {', '.join(valid_claude_models)}")

    def _validate_paths(self) -> None:
        """Validate that critical directories exist."""
        critical_dirs = [
            ('data_dir', self.paths['data_dir']),
            ('vdrs_dir', self.paths['vdrs_dir'])
        ]

        for dir_name, dir_path in critical_dirs:
            if not dir_path.exists():
                raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
            if not dir_path.is_dir():
                raise ValueError(f"Path '{dir_name}' exists but is not a directory: {dir_path}")

    def _validate_models(self) -> None:
        """Validate that required models are available on disk."""
        # Check sentence transformer model
        model_path = Path('models') / 'sentence_transformers' / self.model['sentence_transformer_model'].split('/')[-1]
        if not model_path.exists():
            raise ValueError(f"Sentence transformer model not found: {model_path}")

        # Check cross-encoder model
        cross_encoder_path = Path('models') / 'cross_encoder' / 'ms-marco-MiniLM-L-6-v2'
        if not cross_encoder_path.exists():
            raise ValueError(f"Cross-encoder model not found: {cross_encoder_path}")

    def _validate_processing_config(self) -> None:
        """Validate processing configuration values."""
        processing = self.processing

        # Validate chunk size
        chunk_size = processing['chunk_size']
        if not isinstance(chunk_size, int) or chunk_size <= 0:
            raise ValueError(f"Invalid chunk_size: {chunk_size}. Must be a positive integer.")

        # Validate chunk overlap
        chunk_overlap = processing['chunk_overlap']
        if not isinstance(chunk_overlap, int) or chunk_overlap < 0:
            raise ValueError(f"Invalid chunk_overlap: {chunk_overlap}. Must be a non-negative integer.")
        if chunk_overlap >= chunk_size:
            raise ValueError(f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})")

        # Validate thresholds
        similarity_threshold = processing['similarity_threshold']
        if not isinstance(similarity_threshold, (int, float)) or not (0 <= similarity_threshold <= 1):
            raise ValueError(f"Invalid similarity_threshold: {similarity_threshold}. Must be between 0 and 1.")

        relevancy_threshold = processing['relevancy_threshold']
        if not isinstance(relevancy_threshold, (int, float)) or not (0 <= relevancy_threshold <= 1):
            raise ValueError(f"Invalid relevancy_threshold: {relevancy_threshold}. Must be between 0 and 1.")

        # Validate max tokens.
        # FIX: classification_max_tokens lives in the 'model' section (see
        # _load_config), not 'processing'. Reading it from 'processing' always
        # fell back to the default constant, so the configured value was never
        # actually validated.
        max_tokens = self.model.get('classification_max_tokens', CLASSIFICATION_MAX_TOKENS)
        if not isinstance(max_tokens, int) or max_tokens <= 0:
            raise ValueError(f"Invalid classification_max_tokens: {max_tokens}. Must be a positive integer.")

    def _validate_file_extensions(self) -> None:
        """Validate supported file extensions."""
        extensions = self.processing['supported_file_extensions']
        if not extensions:
            raise ValueError("supported_file_extensions cannot be empty")

        # Validate each extension starts with a dot and contains valid characters
        for ext in extensions:
            if not isinstance(ext, str):
                raise ValueError(f"Invalid file extension type: {type(ext)}. Must be string.")
            if not ext.startswith('.'):
                raise ValueError(f"File extension must start with '.': {ext}")
            if len(ext) < 2 or not ext[1:].replace('_', '').replace('-', '').isalnum():
                raise ValueError(f"Invalid file extension format: {ext}")

    def get_supported_extensions(self) -> list[str]:
        """Get list of supported file extensions for document processing."""
        return self._config['processing']['supported_file_extensions']


# Global configuration instance (lazily created by get_app_config)
_config_instance: Optional[AppConfig] = None


def get_app_config() -> AppConfig:
    """Get the global application configuration instance.

    The instance is created and validated on first access; subsequent calls
    return the cached instance.
    """
    global _config_instance
    if _config_instance is None:
        _config_instance = AppConfig()
        _config_instance.validate()
    return _config_instance


# Compatibility alias
init_app_config = get_app_config

# Compatibility alias
get_config = get_app_config
app/core/constants.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Constants for the application
2
+
3
+ # Chunk sizes
4
+ CHUNK_SIZE = 1000
5
+ CHUNK_OVERLAP = 200
6
+
7
+ # Thresholds
8
+ SIMILARITY_THRESHOLD = 0.2
9
+ RELEVANCY_THRESHOLD = 0.25
10
+
11
+ # Token limits
12
+ CLASSIFICATION_MAX_TOKENS = 1000
13
+ QA_MAX_TOKENS = 8000
14
+ CHECKLIST_PARSING_MAX_TOKENS = 16000 # Large enough for full checklist parsing
15
+
16
+ # AI Model Configuration
17
+ TEMPERATURE = 0.0 # Deterministic responses for due diligence consistency
18
+
19
+ # Batch sizes
20
+ DEFAULT_BATCH_SIZE = 10
21
+ CLASSIFICATION_BATCH_SIZE = 20
22
+
23
+ # AI Analysis types
24
+ SUPPORTED_ANALYSIS_TYPES = ["overview", "strategic", "checklist", "questions"]
app/core/content_ingestion.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified Content Ingestion System
4
+
5
+ This module provides a unified processing pipeline with simple ingestion functions.
6
+ All content types (VDR documents, markdown files, etc.) go through the same processing pipeline
7
+ with different ingestion functions handling the content-specific parsing.
8
+ """
9
+
10
+ # Standard library imports
11
+ import json
12
+ import logging
13
+ import time
14
+ from pathlib import Path
15
+ from typing import List, Dict, Any, Optional, Tuple, Callable
16
+
17
+ # Third-party imports
18
+ from langchain_core.documents import Document
19
+ from langchain_community.vectorstores import FAISS
20
+ from tqdm import tqdm
21
+
22
+ # Local imports
23
+ from app.core.config import get_config
24
+ from app.core.model_cache import get_cached_embeddings
25
+ from app.core.parsers import parse_checklist, parse_questions
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def vdr_ingest(vdr_path: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest VDR documents using DocumentProcessor.

    Counts the supported files under ``vdr_path`` for progress display,
    delegates loading to a DocumentProcessor, and returns the processed
    documents plus ingestion metadata (including the processor's result).
    """
    logger.info(f"Ingesting VDR documents from {vdr_path}")

    # Count supported files up front so the progress bar has a total.
    supported = ('.pdf', '.docx', '.doc', '.txt', '.md')
    total_files = 0
    for candidate in vdr_path.rglob('*'):
        if candidate.is_file() and candidate.suffix.lower() in supported:
            total_files += 1

    # Build the document processor for this store.
    from app.core.utils import create_document_processor
    processor = create_document_processor(store_name=store_name)

    # Run the actual load with a file-level progress bar.
    with tqdm(total=total_files, desc=f"Files in {store_name}",
              unit="files", leave=False) as progress:
        result = processor.load_data_room(str(vdr_path))
        # Advance the bar by the number of documents actually processed.
        # (tqdm with total=0 is falsy, so guard before updating.)
        if progress and result.get('documents_count', 0) > 0:
            progress.update(result['documents_count'])

    metadata = {
        'content_type': 'vdr',
        'source_path': str(vdr_path),
        'total_files': total_files,
        **result,
    }
    return processor.documents, metadata
60
+
61
+
62
def classify_vdr_documents(documents: List[Document], store_name: str, classifier=None) -> Dict[str, str]:
    """Classify VDR documents using fast Haiku classifier.

    Only the first chunk of each document is sent to the classifier (800-char
    preview). Returns a mapping of document path -> classified document type;
    returns an empty dict if there is nothing to classify or on any failure.
    """
    # Nothing to do without both a classifier and documents.
    if not classifier or not documents:
        return {}

    logger.info(f"🏷️ Classifying document types for {store_name}")

    # Only first chunks are needed for classification; preview is capped at 800 chars.
    first_chunks = [
        {
            'name': doc.metadata.get('name', ''),
            'path': doc.metadata.get('path', ''),
            'content': doc.page_content[:800],
        }
        for doc in documents
        if doc.metadata.get('is_first_chunk', False)
    ]

    if not first_chunks:
        logger.warning(f"⚠️ No first chunks found for classification in {store_name}")
        return {}

    try:
        from app.ai.document_classifier import batch_classify_document_types
        classified_docs = batch_classify_document_types(first_chunks, classifier)

        # Keep only entries that were classified and have a usable path.
        classifications = {
            entry['path']: entry['document_type']
            for entry in classified_docs
            if 'document_type' in entry and entry['path']
        }

        logger.info(f"✅ Classified {len(classifications)} document types for {store_name}")
        return classifications

    except Exception as e:
        # Classification is best-effort: log and fall back to no classifications.
        logger.error(f"⚠️ Failed to classify document types for {store_name}: {e}")
        return {}
99
+
100
+
101
def process_content(content_source: Any, content_type: str, store_name: str, classifier=None, llm=None) -> Dict[str, Any]:
    """Process content source into FAISS index.

    Dispatches to the appropriate ingestion function for ``content_type``,
    optionally classifies VDR documents, builds and persists a FAISS index,
    and writes classification / enhanced-checklist sidecar JSON files.

    Returns:
        Result dict: on success, {'success': True, 'store_name', 'processing_time',
        'classifications_count', **ingestion_metadata}; on failure,
        {'success': False, 'store_name', 'error', ...}. This function is a
        pipeline boundary and never raises — failures are reported in the dict.
    """
    start_time = time.time()

    try:
        # Get ingestion function for this content type and run it.
        ingest_func = get_ingestion_function(content_type)
        documents, ingestion_metadata = ingest_func(content_source, store_name, llm)

        if not documents:
            return {
                'success': False,
                'store_name': store_name,
                'error': 'No documents extracted'
            }

        # Classify VDR documents if a classifier was provided.
        classifications = {}
        if classifier and content_type == 'vdr':
            classifications = classify_vdr_documents(documents, store_name, classifier)

        # Create FAISS index.
        # (get_config / get_cached_embeddings are module-level imports; the
        # previous function-local re-imports were redundant and removed.)
        config = get_config()
        embeddings = get_cached_embeddings(config.model['sentence_transformer_model'])
        vector_store = FAISS.from_documents(documents, embeddings)

        # Persist the index under the configured FAISS directory.
        faiss_dir = config.paths['faiss_dir']
        faiss_dir.mkdir(parents=True, exist_ok=True)
        vector_store.save_local(str(faiss_dir), index_name=store_name)

        # Save classifications sidecar if any were produced.
        if classifications:
            classifications_file = faiss_dir / f"{store_name}_document_types.json"
            classifications_file.write_text(
                json.dumps(classifications, indent=2, ensure_ascii=False)
            )

        # Save enhanced checklists sidecar if the ingestion produced them.
        if 'enhanced_checklists' in ingestion_metadata:
            checklists_file = faiss_dir / "checklists.json"
            checklists_file.write_text(
                json.dumps(ingestion_metadata['enhanced_checklists'], indent=2, ensure_ascii=False)
            )

        processing_time = time.time() - start_time

        return {
            'success': True,
            'store_name': store_name,
            'processing_time': processing_time,
            'classifications_count': len(classifications),
            **ingestion_metadata
        }

    except Exception as e:
        # Boundary error handling: convert any failure into an error result.
        return {
            'success': False,
            'store_name': store_name,
            'error': str(e),
            'processing_time': time.time() - start_time
        }
165
+
166
+
167
def checklist_ingest(content_dir: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest checklist markdown files.

    Parses every ``*.md`` file in ``content_dir`` with ``parse_checklist`` and
    flattens each checklist item into a Document tagged with its source file
    and category.

    Raises:
        FileNotFoundError: if ``content_dir`` does not exist.
        ValueError: if no markdown files are found.
    """
    logger.info(f"Ingesting checklist files from {content_dir}")

    if not content_dir.exists():
        raise FileNotFoundError(f"Checklist directory not found: {content_dir}")

    md_files = list(content_dir.glob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    all_documents = []

    with tqdm(md_files, desc="Processing checklist files",
              unit="file", leave=False) as file_pbar:
        for md_file in file_pbar:
            file_pbar.set_description(f"Processing {md_file.name}")
            logger.info(f"Processing: {md_file.name}")

            parsed_data = parse_checklist(md_file.read_text(encoding='utf-8'), llm)

            # Flatten checklist items into one Document per item.
            for category in parsed_data.values():
                category_name = category['name']
                all_documents.extend(
                    Document(
                        page_content=item['text'],
                        metadata={
                            'source': md_file.name,
                            'category': category_name,
                            'type': 'checklist_item',
                        },
                    )
                    for item in category.get('items', [])
                )

    metadata = {
        'content_type': 'checklist',
        'source_path': str(content_dir),
        'md_files_count': len(md_files),
        'documents_count': len(all_documents),
    }
    return all_documents, metadata
212
+
213
+
214
def questions_ingest(content_dir: Path, store_name: str, llm=None) -> Tuple[List[Document], Dict[str, Any]]:
    """Ingest questions markdown files.

    Parses every ``*.md`` file in ``content_dir`` with ``parse_questions`` and
    converts each question into a Document whose text is
    ``"<category>: <question>"``.

    Raises:
        FileNotFoundError: if ``content_dir`` does not exist.
        ValueError: if no markdown files are found.
    """
    logger.info(f"Ingesting questions files from {content_dir}")

    if not content_dir.exists():
        raise FileNotFoundError(f"Questions directory not found: {content_dir}")

    md_files = list(content_dir.glob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {content_dir}")

    all_documents = []

    with tqdm(md_files, desc="Processing questions files",
              unit="file", leave=False) as file_pbar:
        for md_file in file_pbar:
            file_pbar.set_description(f"Processing {md_file.name}")
            logger.info(f"Processing: {md_file.name}")

            parsed_data = parse_questions(md_file.read_text(encoding='utf-8'), llm)

            # One Document per parsed question, carrying its category and id.
            all_documents.extend(
                Document(
                    page_content=f"{question['category']}: {question['question']}",
                    metadata={
                        'source': md_file.name,
                        'category': question['category'],
                        'question_id': question['id'],
                        'type': 'question',
                    },
                )
                for question in parsed_data
            )

    metadata = {
        'content_type': 'questions',
        'source_path': str(content_dir),
        'md_files_count': len(md_files),
        'documents_count': len(all_documents),
    }
    return all_documents, metadata
259
+
260
+
261
+ # Factory function for getting ingestion functions
262
def get_ingestion_function(content_type: str) -> Callable[..., Tuple[List[Document], Dict[str, Any]]]:
    """Factory function to get appropriate ingestion function.

    Raises:
        ValueError: if ``content_type`` is not a known ingestion type.
    """
    registry = {
        'vdr': vdr_ingest,
        'checklist': checklist_ingest,
        'questions': questions_ingest,
    }

    ingest = registry.get(content_type)
    if ingest is None:
        raise ValueError(f"Unknown content type: {content_type}. Available: {list(registry.keys())}")
    return ingest
274
+
275
+
276
+ # Backward compatibility - create UnifiedContentProcessor class that uses process_content
277
class UnifiedContentProcessor:
    """Backward compatibility wrapper for process_content function"""

    def process_content_source(self, content_source: Any, content_type: str, store_name: str, classifier=None, progress_bar=None, llm=None):
        """Process content using the unified function.

        NOTE: ``progress_bar`` is accepted for signature compatibility with the
        legacy API but is not forwarded to ``process_content`` (progress is
        handled internally via tqdm by the ingestion functions).
        """
        return process_content(content_source, content_type, store_name, classifier, llm)
src/document_processing.py → app/core/document_processor.py RENAMED
@@ -2,19 +2,18 @@
2
  """
3
  Streamlined Document Processing Module
4
 
5
- This module provides a simplified document processing pipeline with:
6
- - Direct LangChain loader integration with glob patterns
7
  - Built-in FAISS vector storage without external file tracking
8
  - Semantic text chunking using RecursiveCharacterTextSplitter
9
  - Consolidated document metadata handling
10
  """
11
 
12
  import os
13
- import logging
14
 
15
- # Fix tokenizers parallelism warning
16
- os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
17
- import re
18
 
19
  from pathlib import Path
20
  from typing import Dict, List, Optional, Any, Callable
@@ -23,17 +22,23 @@ from datetime import datetime
23
  # LangChain imports
24
  from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
25
  from langchain_community.vectorstores import FAISS
26
- from langchain_huggingface import HuggingFaceEmbeddings
27
  from langchain_core.documents import Document
28
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
29
 
30
- # Import configuration
31
- from .config import get_config
32
-
33
- # Import error handling
34
-
35
 
36
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
37
 
38
 
39
  # =============================================================================
@@ -43,13 +48,13 @@ logger = logging.getLogger(__name__)
43
  def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
44
  """
45
  Execute a function with basic error handling and logging
46
-
47
  Args:
48
  func: Function to execute
49
  default: Value to return on error
50
  context: Brief description for logs
51
  log_errors: Whether to log errors
52
-
53
  Returns:
54
  Function result or default value on error
55
  """
@@ -78,71 +83,98 @@ def escape_markdown_math(text: str) -> str:
78
  class DocumentProcessor:
79
  """
80
  Streamlined document processing class with integrated FAISS vector storage
81
-
82
  This class consolidates all document processing functionality including:
83
  - Document loading using LangChain's DirectoryLoader with glob patterns
84
  - Semantic text chunking with RecursiveCharacterTextSplitter
85
  - FAISS vector storage for similarity search
86
  - Document metadata handling
87
  """
88
-
89
  def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
90
  """
91
  Initialize the document processor
92
-
93
  Args:
94
  model_name: Name of the sentence transformer model for embeddings (optional)
95
  store_name: Name for the FAISS store (optional, uses config default)
96
  """
97
- config = get_config()
98
- self.model_name = model_name or config.model.sentence_transformer_model
99
- self.store_name = store_name or config.processing.faiss_store_name
100
-
101
  # Initialize components
102
  self.documents: List[Document] = []
103
  self.vector_store: Optional[FAISS] = None
104
  self.embeddings: Optional[HuggingFaceEmbeddings] = None
105
  self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
106
  self.performance_stats = {}
107
-
108
  # Convenience properties for backward compatibility
109
  self.chunks = [] # Will be populated after processing
110
-
111
  # Initialize text splitter with semantic boundaries
112
  self._init_text_splitter()
113
-
114
  # Initialize embeddings if model name provided
115
  if self.model_name:
116
- self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
117
- logger.info(f"Initialized embeddings with model: {self.model_name}")
 
 
 
 
 
 
 
 
 
 
 
118
  else:
119
  logger.warning("No model name provided - embeddings not initialized")
120
-
 
121
  # Try to load existing FAISS store
122
  self._load_existing_store()
123
-
124
  def _init_text_splitter(self):
125
  """Initialize the text splitter with optimal settings for semantic chunking"""
126
- config = get_config()
127
  self.text_splitter = RecursiveCharacterTextSplitter(
128
- chunk_size=config.processing.chunk_size,
129
- chunk_overlap=config.processing.chunk_overlap,
130
- separators=["\\n\\n", "\\n", ".", "!", "?", ",", " "],
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  length_function=len,
132
  is_separator_regex=False,
 
 
133
  )
134
- logger.info(f"Initialized text splitter: {config.processing.chunk_size} chars, {config.processing.chunk_overlap} overlap")
135
-
136
  def _load_existing_store(self):
137
  """Load existing FAISS store if available"""
138
  if not self.embeddings:
139
  return
140
-
141
- config = get_config()
142
- faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
143
  faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
144
  faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
145
-
146
  try:
147
  if faiss_index_path.exists() and faiss_pkl_path.exists():
148
  self.vector_store = FAISS.load_local(
@@ -157,60 +189,54 @@ class DocumentProcessor:
157
  except Exception as e:
158
  logger.error(f"Failed to load FAISS store: {e}")
159
  self.vector_store = None
160
-
161
- def _save_store(self):
162
- """Save FAISS store to disk"""
163
- if not self.vector_store:
164
- return
165
-
166
- try:
167
- config = get_config()
168
- faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
169
- faiss_dir.mkdir(parents=True, exist_ok=True)
170
-
171
- self.vector_store.save_local(
172
- str(faiss_dir),
173
- index_name=self.store_name
174
- )
175
- logger.info(f"Saved FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
176
- except Exception as e:
177
- logger.error(f"Failed to save FAISS store: {e}")
178
-
179
  def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
180
  """
181
  Load and process an entire data room using DirectoryLoader with glob patterns
182
-
183
  Args:
184
  data_room_path: Path to the data room directory
185
  progress_bar: Optional Streamlit progress bar object
186
-
187
  Returns:
188
  Dictionary with processing results including performance metrics
189
  """
190
  import time
191
  start_time = time.time()
192
-
193
- config = get_config()
194
  data_room_path = Path(data_room_path)
195
-
196
  if not data_room_path.exists():
197
  logger.error(f"Data room path does not exist: {data_room_path}")
198
  return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
199
-
200
  logger.info(f"Starting streamlined data room processing: {data_room_path}")
201
-
202
  # Clear existing documents
203
  self.documents = []
 
 
 
 
204
  documents_loaded = 0
205
-
 
206
  # Load documents by file type using DirectoryLoader with glob patterns
207
- supported_extensions = config.processing.supported_file_extensions
208
-
 
 
 
 
 
 
209
  for ext in supported_extensions:
210
  try:
211
  # Create glob pattern for this extension
212
  glob_pattern = f"**/*{ext}"
213
-
214
  # Choose appropriate loader based on extension
215
  if ext == '.pdf':
216
  loader_cls = PyPDFLoader
@@ -220,7 +246,7 @@ class DocumentProcessor:
220
  loader_cls = TextLoader
221
  else:
222
  continue
223
-
224
  # Use DirectoryLoader with glob pattern
225
  loader = DirectoryLoader(
226
  str(data_room_path),
@@ -231,14 +257,14 @@ class DocumentProcessor:
231
  show_progress=False, # Disable verbose progress output
232
  use_multithreading=True
233
  )
234
-
235
  # Load documents for this extension
236
  docs = safe_execute(
237
  lambda: loader.load(),
238
  default=[],
239
  context=f"Loading {ext} files"
240
  )
241
-
242
  if docs:
243
  # Add relative path information to metadata
244
  for doc in docs:
@@ -253,34 +279,55 @@ class DocumentProcessor:
253
  # If relative path fails, use original source
254
  doc.metadata['path'] = doc.metadata['source']
255
  doc.metadata['name'] = source_path.name
256
-
257
  self.documents.extend(docs)
258
  documents_loaded += len(docs)
259
  logger.info(f"Loaded {len(docs)} {ext} documents")
260
-
 
 
 
 
 
 
261
  except Exception as e:
262
  logger.error(f"Error loading {ext} files: {e}")
263
-
264
  scan_time = time.time() - start_time
265
  logger.info(f"Document loading completed in {scan_time:.2f} seconds")
266
-
267
  # Split documents into chunks using the text splitter
268
  chunk_start = time.time()
269
  if self.documents and self.text_splitter:
 
 
 
270
  self.documents = self.text_splitter.split_documents(self.documents)
271
-
272
  # Add chunk metadata and populate chunks for backward compatibility
 
 
273
  self.chunks = []
 
274
  for i, doc in enumerate(self.documents):
275
  doc.metadata['chunk_id'] = f"chunk_{i}"
276
  doc.metadata['processed_at'] = datetime.now().isoformat()
277
-
 
 
 
 
 
 
 
 
 
278
  # Add citation information if available
279
  if 'page' in doc.metadata:
280
  doc.metadata['citation'] = f"page {doc.metadata['page']}"
281
  else:
282
  doc.metadata['citation'] = doc.metadata.get('name', 'document')
283
-
284
  # Create chunk dict for backward compatibility
285
  chunk_dict = {
286
  'text': doc.page_content,
@@ -290,33 +337,29 @@ class DocumentProcessor:
290
  'metadata': doc.metadata
291
  }
292
  self.chunks.append(chunk_dict)
293
-
 
 
 
294
  chunk_time = time.time() - chunk_start
295
  logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
296
-
297
- # Create or update FAISS vector store
298
  embedding_time = 0
299
  if self.embeddings and self.documents:
300
  embedding_start = time.time()
301
-
302
  if self.vector_store is None:
303
- # Create new FAISS store
304
- self.vector_store = FAISS.from_documents(self.documents, self.embeddings)
305
- logger.info(f"Created new FAISS store with {len(self.documents)} documents")
306
  else:
307
- # Add documents to existing store
308
- self.vector_store.add_documents(self.documents)
309
- logger.info(f"Added {len(self.documents)} documents to existing FAISS store")
310
-
311
- # Save the updated store
312
- self._save_store()
313
-
314
  embedding_time = time.time() - embedding_start
315
- logger.info(f"FAISS processing completed in {embedding_time:.2f} seconds")
316
-
317
  total_time = time.time() - start_time
318
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
319
-
320
  # Store performance stats
321
  self.performance_stats = {
322
  'total_time': total_time,
@@ -325,7 +368,7 @@ class DocumentProcessor:
325
  'embedding_time': embedding_time,
326
  'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
327
  }
328
-
329
  return {
330
  'documents_count': documents_loaded,
331
  'chunks_count': len(self.documents),
@@ -333,65 +376,80 @@ class DocumentProcessor:
333
  'has_embeddings': self.vector_store is not None,
334
  'performance': self.performance_stats
335
  }
336
-
337
  def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
338
  """
339
  Search documents using FAISS similarity search
340
-
341
  Args:
342
  query: Search query
343
  top_k: Number of top results to return
344
  threshold: Minimum similarity threshold
345
-
346
  Returns:
347
  List of search results with scores and metadata
348
  """
349
  if not self.vector_store:
350
  logger.warning("FAISS vector store not available for search")
351
  return []
352
-
353
- config = get_config()
354
  if threshold is None:
355
- threshold = config.processing.similarity_threshold
356
-
357
  try:
358
- # Perform similarity search with scores
359
- docs_and_scores = self.vector_store.similarity_search_with_score(query, k=top_k*2)
360
-
361
- results = []
 
362
  seen_texts = set()
363
-
364
  for doc, score in docs_and_scores:
365
  # Convert FAISS distance to similarity score (higher is better)
366
- similarity_score = 1.0 / (1.0 + score) if score >= 0 else 1.0
367
-
368
  if similarity_score < threshold:
369
  continue
370
-
371
  # Avoid duplicates based on text content
372
  text_preview = doc.page_content[:100]
373
  if text_preview not in seen_texts:
374
  seen_texts.add(text_preview)
375
-
376
- results.append({
377
  'text': doc.page_content,
378
  'source': doc.metadata.get('name', ''),
379
  'path': doc.metadata.get('path', ''),
380
- 'full_path': doc.metadata.get('source', ''),
381
- 'citation': doc.metadata.get('citation', 'document'),
382
  'score': float(similarity_score),
383
  'metadata': doc.metadata
384
  })
385
-
386
- if len(results) >= top_k:
387
- break
388
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  return results
390
-
391
  except Exception as e:
392
  logger.error(f"Failed to search FAISS store: {e}")
393
- return []
394
-
395
  def get_statistics(self) -> Dict[str, Any]:
396
  """Get processing statistics"""
397
  stats = {
@@ -401,10 +459,9 @@ class DocumentProcessor:
401
  'store_name': self.store_name,
402
  'model_name': self.model_name
403
  }
404
-
405
  # Add performance metrics if available
406
  if self.performance_stats:
407
  stats['performance'] = self.performance_stats
408
-
409
  return stats
410
-
 
2
  """
3
  Streamlined Document Processing Module
4
 
5
+ This module provides a document processing pipeline with:
6
+ - Direct LangChain loader integration with glob patterns
7
  - Built-in FAISS vector storage without external file tracking
8
  - Semantic text chunking using RecursiveCharacterTextSplitter
9
  - Consolidated document metadata handling
10
  """
11
 
12
  import os
13
+ import time
14
 
15
+ # Enable tokenizers parallelism for better performance
16
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
 
17
 
18
  from pathlib import Path
19
  from typing import Dict, List, Optional, Any, Callable
 
22
  # LangChain imports
23
  from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
24
  from langchain_community.vectorstores import FAISS
 
25
  from langchain_core.documents import Document
26
  from langchain_text_splitters import RecursiveCharacterTextSplitter
27
+ from langchain_huggingface import HuggingFaceEmbeddings
28
 
29
+ # Import configuration and utilities from app modules
30
+ from app.core.config import get_app_config
31
+ from app.core.model_cache import get_cached_embeddings
32
+ from app.core.logging import logger
33
+ from app.core.performance import get_performance_manager, monitor_performance, cached_by_content
34
 
35
+ # Optional accelerate import
36
+ try:
37
+ from accelerate import Accelerator
38
+ ACCELERATE_AVAILABLE = True
39
+ except ImportError:
40
+ ACCELERATE_AVAILABLE = False
41
+ Accelerator = None
42
 
43
 
44
  # =============================================================================
 
48
  def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
49
  """
50
  Execute a function with basic error handling and logging
51
+
52
  Args:
53
  func: Function to execute
54
  default: Value to return on error
55
  context: Brief description for logs
56
  log_errors: Whether to log errors
57
+
58
  Returns:
59
  Function result or default value on error
60
  """
 
83
  class DocumentProcessor:
84
  """
85
  Streamlined document processing class with integrated FAISS vector storage
86
+
87
  This class consolidates all document processing functionality including:
88
  - Document loading using LangChain's DirectoryLoader with glob patterns
89
  - Semantic text chunking with RecursiveCharacterTextSplitter
90
  - FAISS vector storage for similarity search
91
  - Document metadata handling
92
  """
93
+
94
  def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
95
  """
96
  Initialize the document processor
97
+
98
  Args:
99
  model_name: Name of the sentence transformer model for embeddings (optional)
100
  store_name: Name for the FAISS store (optional, uses config default)
101
  """
102
+ config = get_app_config()
103
+ self.model_name = model_name or config.model['sentence_transformer_model']
104
+ self.store_name = store_name or config.processing['faiss_store_name']
105
+
106
  # Initialize components
107
  self.documents: List[Document] = []
108
  self.vector_store: Optional[FAISS] = None
109
  self.embeddings: Optional[HuggingFaceEmbeddings] = None
110
  self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
111
  self.performance_stats = {}
112
+
113
  # Convenience properties for backward compatibility
114
  self.chunks = [] # Will be populated after processing
115
+
116
  # Initialize text splitter with semantic boundaries
117
  self._init_text_splitter()
118
+
119
  # Initialize embeddings if model name provided
120
  if self.model_name:
121
+ self.embeddings = get_cached_embeddings(self.model_name)
122
+ logger.info(f"Initialized cached embeddings with model: {self.model_name}")
123
+
124
+ # Setup accelerate for GPU optimization if available
125
+ if ACCELERATE_AVAILABLE:
126
+ try:
127
+ self.accelerator = Accelerator()
128
+ logger.info(f"Accelerate initialized with device: {self.accelerator.device}")
129
+ except Exception as e:
130
+ logger.warning(f"Failed to initialize accelerate: {e}")
131
+ self.accelerator = None
132
+ else:
133
+ self.accelerator = None
134
  else:
135
  logger.warning("No model name provided - embeddings not initialized")
136
+ self.accelerator = None
137
+
138
  # Try to load existing FAISS store
139
  self._load_existing_store()
140
+
141
  def _init_text_splitter(self):
142
  """Initialize the text splitter with optimal settings for semantic chunking"""
143
+ config = get_app_config()
144
  self.text_splitter = RecursiveCharacterTextSplitter(
145
+ chunk_size=config.processing['chunk_size'],
146
+ chunk_overlap=config.processing['chunk_overlap'],
147
+ # Better separators for business documents with semantic boundaries
148
+ separators=[
149
+ "\n\n\n", # Triple newlines (major section breaks)
150
+ "\n\n", # Double newlines (paragraph breaks)
151
+ "\n", # Single newlines
152
+ ". ", # Sentences
153
+ ".\n", # Sentences with newlines
154
+ "! ", # Exclamations
155
+ "? ", # Questions
156
+ "; ", # Semicolons (common in legal/business docs)
157
+ ", ", # Commas (last resort for long sentences)
158
+ " ", # Spaces
159
+ "", # Character level (absolute last resort)
160
+ ],
161
  length_function=len,
162
  is_separator_regex=False,
163
+ # Keep related content together
164
+ keep_separator=True, # Keep separators to maintain context
165
  )
166
+ logger.info(f"Initialized semantic text splitter: {config.processing['chunk_size']} chars, {config.processing['chunk_overlap']} overlap")
167
+
168
  def _load_existing_store(self):
169
  """Load existing FAISS store if available"""
170
  if not self.embeddings:
171
  return
172
+
173
+ config = get_app_config()
174
+ faiss_dir = config.paths['faiss_dir']
175
  faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
176
  faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
177
+
178
  try:
179
  if faiss_index_path.exists() and faiss_pkl_path.exists():
180
  self.vector_store = FAISS.load_local(
 
189
  except Exception as e:
190
  logger.error(f"Failed to load FAISS store: {e}")
191
  self.vector_store = None
192
+
193
+ @monitor_performance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
195
  """
196
  Load and process an entire data room using DirectoryLoader with glob patterns
197
+
198
  Args:
199
  data_room_path: Path to the data room directory
200
  progress_bar: Optional Streamlit progress bar object
201
+
202
  Returns:
203
  Dictionary with processing results including performance metrics
204
  """
205
  import time
206
  start_time = time.time()
207
+
208
+ config = get_app_config()
209
  data_room_path = Path(data_room_path)
210
+
211
  if not data_room_path.exists():
212
  logger.error(f"Data room path does not exist: {data_room_path}")
213
  return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
214
+
215
  logger.info(f"Starting streamlined data room processing: {data_room_path}")
216
+
217
  # Clear existing documents
218
  self.documents = []
219
+
220
+ @monitor_performance
221
+ def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
222
+ start_time = time.time()
223
  documents_loaded = 0
224
+ config = get_app_config()
225
+
226
  # Load documents by file type using DirectoryLoader with glob patterns
227
+ supported_extensions = config.processing['supported_file_extensions']
228
+ perf_manager = get_performance_manager()
229
+
230
+ # Get memory info for batch optimization
231
+ mem_info = perf_manager.monitor_memory_usage()
232
+ logger.info(f"Memory usage at start: {mem_info['percent']:.1f}%")
233
+ logger.info(f"Available memory: {mem_info['rss']:.1f}MB")
234
+
235
  for ext in supported_extensions:
236
  try:
237
  # Create glob pattern for this extension
238
  glob_pattern = f"**/*{ext}"
239
+
240
  # Choose appropriate loader based on extension
241
  if ext == '.pdf':
242
  loader_cls = PyPDFLoader
 
246
  loader_cls = TextLoader
247
  else:
248
  continue
249
+
250
  # Use DirectoryLoader with glob pattern
251
  loader = DirectoryLoader(
252
  str(data_room_path),
 
257
  show_progress=False, # Disable verbose progress output
258
  use_multithreading=True
259
  )
260
+
261
  # Load documents for this extension
262
  docs = safe_execute(
263
  lambda: loader.load(),
264
  default=[],
265
  context=f"Loading {ext} files"
266
  )
267
+
268
  if docs:
269
  # Add relative path information to metadata
270
  for doc in docs:
 
279
  # If relative path fails, use original source
280
  doc.metadata['path'] = doc.metadata['source']
281
  doc.metadata['name'] = source_path.name
282
+
283
  self.documents.extend(docs)
284
  documents_loaded += len(docs)
285
  logger.info(f"Loaded {len(docs)} {ext} documents")
286
+
287
+ # Monitor memory usage and trigger GC if needed
288
+ mem_usage = perf_manager.monitor_memory_usage()
289
+ if perf_manager.should_gc_collect(mem_usage):
290
+ import gc
291
+ gc.collect()
292
+ logger.debug(f"GC triggered - memory usage: {mem_usage['rss']:.1f}MB")
293
  except Exception as e:
294
  logger.error(f"Error loading {ext} files: {e}")
295
+
296
  scan_time = time.time() - start_time
297
  logger.info(f"Document loading completed in {scan_time:.2f} seconds")
298
+
299
  # Split documents into chunks using the text splitter
300
  chunk_start = time.time()
301
  if self.documents and self.text_splitter:
302
+ # Track original documents to identify first chunks
303
+ original_docs = {doc.metadata.get('source', ''): True for doc in self.documents}
304
+
305
  self.documents = self.text_splitter.split_documents(self.documents)
306
+
307
  # Add chunk metadata and populate chunks for backward compatibility
308
+ # Track which documents we've seen to mark first chunks
309
+ seen_documents = {}
310
  self.chunks = []
311
+
312
  for i, doc in enumerate(self.documents):
313
  doc.metadata['chunk_id'] = f"chunk_{i}"
314
  doc.metadata['processed_at'] = datetime.now().isoformat()
315
+
316
+ # Mark first chunks for each document (critical for document type matching)
317
+ doc_source = doc.metadata.get('source', '')
318
+ if doc_source not in seen_documents:
319
+ doc.metadata['is_first_chunk'] = True
320
+ seen_documents[doc_source] = True
321
+ logger.debug(f"First chunk marked for: {doc_source}")
322
+ else:
323
+ doc.metadata['is_first_chunk'] = False
324
+
325
  # Add citation information if available
326
  if 'page' in doc.metadata:
327
  doc.metadata['citation'] = f"page {doc.metadata['page']}"
328
  else:
329
  doc.metadata['citation'] = doc.metadata.get('name', 'document')
330
+
331
  # Create chunk dict for backward compatibility
332
  chunk_dict = {
333
  'text': doc.page_content,
 
337
  'metadata': doc.metadata
338
  }
339
  self.chunks.append(chunk_dict)
340
+
341
+ first_chunks_count = len([doc for doc in self.documents if doc.metadata.get('is_first_chunk', False)])
342
+ logger.info(f"Marked {first_chunks_count} first chunks out of {len(self.documents)} total chunks")
343
+
344
  chunk_time = time.time() - chunk_start
345
  logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
346
+
347
+ # FAISS vector store should be loaded from pre-built indices
348
  embedding_time = 0
349
  if self.embeddings and self.documents:
350
  embedding_start = time.time()
351
+
352
  if self.vector_store is None:
353
+ logger.debug("FAISS store not pre-loaded (expected during index building)")
 
 
354
  else:
355
+ logger.info(f"Using pre-loaded FAISS store with {self.vector_store.index.ntotal} vectors")
356
+
 
 
 
 
 
357
  embedding_time = time.time() - embedding_start
358
+ logger.info(f"FAISS check completed in {embedding_time:.2f} seconds")
359
+
360
  total_time = time.time() - start_time
361
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
362
+
363
  # Store performance stats
364
  self.performance_stats = {
365
  'total_time': total_time,
 
368
  'embedding_time': embedding_time,
369
  'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
370
  }
371
+
372
  return {
373
  'documents_count': documents_loaded,
374
  'chunks_count': len(self.documents),
 
376
  'has_embeddings': self.vector_store is not None,
377
  'performance': self.performance_stats
378
  }
379
+
380
  def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
381
  """
382
  Search documents using FAISS similarity search
383
+
384
  Args:
385
  query: Search query
386
  top_k: Number of top results to return
387
  threshold: Minimum similarity threshold
388
+
389
  Returns:
390
  List of search results with scores and metadata
391
  """
392
  if not self.vector_store:
393
  logger.warning("FAISS vector store not available for search")
394
  return []
395
+
396
+ config = get_app_config()
397
  if threshold is None:
398
+ threshold = config.processing['similarity_threshold']
399
+
400
  try:
401
+ # Perform similarity search with scores - get more candidates for reranking
402
+ docs_and_scores = self.vector_store.similarity_search_with_score(query, k=max(20, top_k*3))
403
+
404
+ # Initial filtering and conversion to candidates format
405
+ candidates = []
406
  seen_texts = set()
407
+
408
  for doc, score in docs_and_scores:
409
  # Convert FAISS distance to similarity score (higher is better)
410
+ similarity_score = 1.0 - (score / 2.0) if score <= 2.0 else 0.0
411
+
412
  if similarity_score < threshold:
413
  continue
414
+
415
  # Avoid duplicates based on text content
416
  text_preview = doc.page_content[:100]
417
  if text_preview not in seen_texts:
418
  seen_texts.add(text_preview)
419
+
420
+ candidates.append({
421
  'text': doc.page_content,
422
  'source': doc.metadata.get('name', ''),
423
  'path': doc.metadata.get('path', ''),
 
 
424
  'score': float(similarity_score),
425
  'metadata': doc.metadata
426
  })
427
+
428
+ # Apply reranking if we have candidates
429
+ if candidates:
430
+ try:
431
+ # Import rerank_results from ranking module to avoid circular import
432
+ from app.core.ranking import rerank_results
433
+
434
+ # Rerank the top candidates (limit to reasonable number for performance)
435
+ candidates_to_rerank = candidates[:min(15, len(candidates))] # Rerank up to 15 candidates
436
+
437
+ reranked_results = rerank_results(query, candidates_to_rerank)
438
+ results = reranked_results[:top_k] # Take top_k after reranking
439
+ logger.info(f"Reranked {len(reranked_results)} search results for query: {query[:50]}...")
440
+ except Exception as e:
441
+ # Reranking failed - use original results without reranking
442
+ logger.warning(f"Reranking failed for search query '{query}': {e}. Using original similarity scores.")
443
+ results = candidates[:top_k]
444
+ else:
445
+ results = []
446
+
447
  return results
448
+
449
  except Exception as e:
450
  logger.error(f"Failed to search FAISS store: {e}")
451
+ raise RuntimeError(f"Document search failed for query '{query}': {e}") from e
452
+
453
  def get_statistics(self) -> Dict[str, Any]:
454
  """Get processing statistics"""
455
  stats = {
 
459
  'store_name': self.store_name,
460
  'model_name': self.model_name
461
  }
462
+
463
  # Add performance metrics if available
464
  if self.performance_stats:
465
  stats['performance'] = self.performance_stats
466
+
467
  return stats
 
app/core/exceptions.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Core Exception Classes
4
+
5
+ Centralized exception definitions for the application.
6
+ This module provides clean exception classes without
7
+ depending on UI or external frameworks.
8
+ """
9

from typing import Optional

11
class AppException(Exception):
    """Base exception class for application-specific errors.

    Carries a developer-facing ``message`` (also used as the standard
    exception text), an optional user-facing ``user_message`` and an
    optional ``recovery_hint`` describing how to resolve the error.
    """

    def __init__(self, message: str, user_message: Optional[str] = None, recovery_hint: Optional[str] = None):
        # Technical description; also forwarded to Exception so str(exc) works.
        self.message = message
        # Fall back to the technical message when no user-facing text is given.
        self.user_message = user_message or message
        # May stay None when no actionable recovery advice exists.
        self.recovery_hint = recovery_hint
        super().__init__(message)
19
+
20
+
21
class ValidationError(AppException):
    """Raised when input validation fails."""


class ProcessingError(AppException):
    """Raised when document processing fails."""


class AIError(AppException):
    """Raised when an AI service call fails."""


class ConfigError(AppException):
    """Raised for configuration problems."""


class FileOperationError(AppException):
    """Raised when a file operation fails."""


class NetworkError(AppException):
    """Raised for network-related failures."""


class LLMConnectionError(AIError):
    """Raised when connecting to the LLM API fails."""


class LLMAuthenticationError(AIError):
    """Raised when LLM API authentication fails."""


class LLMTimeoutError(AIError):
    """Raised when an LLM API call times out."""


class LLMQuotaExceededError(AIError):
    """Raised when the LLM API quota or rate limit is exceeded."""


class LLMInvalidResponseError(AIError):
    """Raised when the LLM API returns an invalid response."""


class DocumentProcessingError(ProcessingError):
    """Raised when processing a single document fails."""


class SearchError(AppException):
    """Raised when a search operation fails."""
84
+
85
+
86
+ # Convenience functions for creating exceptions
87
def create_validation_error(message: str, recovery_hint: Optional[str] = None) -> ValidationError:
    """Build a ValidationError with consistent user-facing formatting.

    Args:
        message: Technical description of the validation failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.

    Returns:
        A ValidationError carrying both technical and user-facing text.
    """
    return ValidationError(
        message,
        user_message=f"Validation error: {message}",
        recovery_hint=recovery_hint or "Please check your input and try again"
    )
94
+
95
+
96
def create_processing_error(message: str, recovery_hint: Optional[str] = None) -> ProcessingError:
    """Build a ProcessingError with consistent user-facing formatting.

    Args:
        message: Technical description of the processing failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return ProcessingError(
        message,
        user_message=f"Processing error: {message}",
        recovery_hint=recovery_hint or "Please check your files and try again"
    )
103
+
104
+
105
def create_ai_error(message: str, recovery_hint: Optional[str] = None) -> AIError:
    """Build an AIError with consistent user-facing formatting.

    Args:
        message: Technical description of the AI service failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return AIError(
        message,
        user_message=f"AI service error: {message}",
        recovery_hint=recovery_hint or "Please check your API key and try again"
    )
112
+
113
+
114
def create_config_error(message: str, recovery_hint: Optional[str] = None) -> ConfigError:
    """Build a ConfigError with consistent user-facing formatting.

    Args:
        message: Technical description of the configuration problem.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return ConfigError(
        message,
        user_message=f"Configuration error: {message}",
        recovery_hint=recovery_hint or "Please check your configuration and environment variables"
    )
121
+
122
+
123
def create_file_error(message: str, recovery_hint: Optional[str] = None) -> FileOperationError:
    """Build a FileOperationError with consistent user-facing formatting.

    Args:
        message: Technical description of the file operation failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return FileOperationError(
        message,
        user_message=f"File error: {message}",
        recovery_hint=recovery_hint or "Please check file permissions and paths"
    )
130
+
131
+
132
def create_network_error(message: str, recovery_hint: Optional[str] = None) -> NetworkError:
    """Build a NetworkError with consistent user-facing formatting.

    Args:
        message: Technical description of the network failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return NetworkError(
        message,
        user_message=f"Network error: {message}",
        recovery_hint=recovery_hint or "Please check your internet connection and try again"
    )
139
+
140
+
141
def create_llm_connection_error(message: str, recovery_hint: Optional[str] = None) -> LLMConnectionError:
    """Build an LLMConnectionError with consistent user-facing formatting.

    Args:
        message: Technical description of the connection failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return LLMConnectionError(
        message,
        user_message=f"AI service connection error: {message}",
        recovery_hint=recovery_hint or "Please check your internet connection and try again"
    )
148
+
149
+
150
def create_llm_authentication_error(message: str, recovery_hint: Optional[str] = None) -> LLMAuthenticationError:
    """Build an LLMAuthenticationError with consistent user-facing formatting.

    Args:
        message: Technical description of the authentication failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return LLMAuthenticationError(
        message,
        user_message=f"AI service authentication error: {message}",
        recovery_hint=recovery_hint or "Please check your API key and try again"
    )
157
+
158
+
159
def create_llm_timeout_error(message: str, recovery_hint: Optional[str] = None) -> LLMTimeoutError:
    """Build an LLMTimeoutError with consistent user-facing formatting.

    Args:
        message: Technical description of the timeout.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return LLMTimeoutError(
        message,
        user_message=f"AI service timeout: {message}",
        recovery_hint=recovery_hint or "Please try again in a few moments"
    )
166
+
167
+
168
def create_llm_quota_error(message: str, recovery_hint: Optional[str] = None) -> LLMQuotaExceededError:
    """Build an LLMQuotaExceededError with consistent user-facing formatting.

    Args:
        message: Technical description of the quota/rate-limit failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return LLMQuotaExceededError(
        message,
        user_message=f"AI service quota exceeded: {message}",
        recovery_hint=recovery_hint or "Please check your API usage limits and try again later"
    )
175
+
176
+
177
def create_llm_invalid_response_error(message: str, recovery_hint: Optional[str] = None) -> LLMInvalidResponseError:
    """Build an LLMInvalidResponseError with consistent user-facing formatting.

    Args:
        message: Technical description of the invalid response.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return LLMInvalidResponseError(
        message,
        user_message=f"AI service returned invalid response: {message}",
        recovery_hint=recovery_hint or "Please try again or contact support if the issue persists"
    )
184
+
185
+
186
def create_document_processing_error(message: str, recovery_hint: Optional[str] = None) -> DocumentProcessingError:
    """Build a DocumentProcessingError with consistent user-facing formatting.

    Args:
        message: Technical description of the document processing failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return DocumentProcessingError(
        message,
        user_message=f"Document processing error: {message}",
        recovery_hint=recovery_hint or "Please check your document format and try again"
    )
193
+
194
+
195
def create_search_error(message: str, recovery_hint: Optional[str] = None) -> SearchError:
    """Build a SearchError with consistent user-facing formatting.

    Args:
        message: Technical description of the search failure.
        recovery_hint: Optional user-facing recovery advice; a default hint
            is supplied when omitted.
    """
    return SearchError(
        message,
        user_message=f"Search error: {message}",
        recovery_hint=recovery_hint or "Please try adjusting your search terms"
    )
app/core/knowledge_graph.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Knowledge Graph Module
4
+
5
+ This module provides efficient loading and querying of pre-computed knowledge graphs
6
+ in Streamlit applications. It's designed to work with graphs generated by the
7
+ build_knowledge_graphs.py script.
8
+
9
+ Key features:
10
+ - Fast graph loading with caching
11
+ - Rich query interface for graph exploration
12
+ - Integration with existing document processor workflow
13
+ - Memory-efficient graph operations
14
+ """
15
+
16
+ import pickle
17
+ import json
18
+ import numpy as np
19
+ from pathlib import Path
20
+ from typing import Dict, List, Any, Optional, Set, Tuple
21
+ from datetime import datetime
22
+ import streamlit as st
23
+
24
+ import networkx as nx
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ from app.core.config import get_config
27
+ from app.core.logging import logger
28
+
29
+ class KnowledgeGraphManager:
30
+ """
31
+ Manages loading and querying of knowledge graphs for due diligence analysis.
32
+
33
+ This class provides a clean interface for working with pre-computed knowledge
34
+ graphs in Streamlit applications, with efficient caching and query capabilities.
35
+ """
36
+
37
    def __init__(self, store_name: str):
        """
        Initialize the knowledge graph manager for a specific company.

        No data is read here; call load_graph() to populate the graph,
        metadata and entities from disk.

        Args:
            store_name: The company store name (matches FAISS index name)
        """
        self.store_name = store_name
        # Populated by load_graph(); all three stay None until a successful load.
        self.graph: Optional[nx.MultiDiGraph] = None
        self.metadata: Optional[Dict[str, Any]] = None
        self.entities: Optional[Dict[str, List[Dict]]] = None
        self.document_processor = None  # Will be loaded on-demand for semantic search
        self._config = get_config()
50
+
51
+ @st.cache_data(ttl=3600) # Cache for 1 hour
52
+ def load_graph(_self) -> bool:
53
+ """
54
+ Load the knowledge graph from disk with caching.
55
+
56
+ Returns:
57
+ bool: True if graph was loaded successfully, False otherwise
58
+ """
59
+ try:
60
+ graphs_dir = _self._config.paths['faiss_dir'] / 'knowledge_graphs'
61
+
62
+ # Load main graph
63
+ graph_file = graphs_dir / f"{_self.store_name}_knowledge_graph.pkl"
64
+ if not graph_file.exists():
65
+ logger.warning(f"Knowledge graph not found: {graph_file}")
66
+ return False
67
+
68
+ with open(graph_file, 'rb') as f:
69
+ _self.graph = pickle.load(f)
70
+
71
+ # Load metadata
72
+ metadata_file = graphs_dir / f"{_self.store_name}_graph_metadata.json"
73
+ if metadata_file.exists():
74
+ with open(metadata_file, 'r') as f:
75
+ _self.metadata = json.load(f)
76
+
77
+ # Load entities
78
+ entities_file = graphs_dir / f"{_self.store_name}_entities.json"
79
+ if entities_file.exists():
80
+ with open(entities_file, 'r') as f:
81
+ _self.entities = json.load(f)
82
+
83
+ logger.info(f"Loaded knowledge graph for {_self.store_name}: "
84
+ f"{len(_self.graph.nodes())} nodes, {len(_self.graph.edges())} edges")
85
+ return True
86
+
87
+ except Exception as e:
88
+ logger.error(f"Failed to load knowledge graph for {_self.store_name}: {e}")
89
+ return False
90
+
91
+ def is_available(self) -> bool:
92
+ """Check if knowledge graph is available and loaded"""
93
+ return self.graph is not None and len(self.graph.nodes()) > 0
94
+
95
+ def get_summary_stats(self) -> Dict[str, Any]:
96
+ """Get summary statistics about the knowledge graph"""
97
+ if not self.is_available():
98
+ return {}
99
+
100
+ stats = {
101
+ 'num_entities': len(self.graph.nodes()),
102
+ 'num_relationships': len(self.graph.edges()),
103
+ 'entity_types': {},
104
+ 'relationship_types': {},
105
+ 'created_at': self.metadata.get('created_at') if self.metadata else None
106
+ }
107
+
108
+ # Count entity types
109
+ for node in self.graph.nodes():
110
+ node_type = self.graph.nodes[node].get('type', 'unknown')
111
+ stats['entity_types'][node_type] = stats['entity_types'].get(node_type, 0) + 1
112
+
113
+ # Count relationship types
114
+ for _, _, edge_data in self.graph.edges(data=True):
115
+ rel_type = edge_data.get('relationship', 'unknown')
116
+ stats['relationship_types'][rel_type] = stats['relationship_types'].get(rel_type, 0) + 1
117
+
118
+ return stats
119
+
120
+ def search_entities(self, query: str, entity_type: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
121
+ """
122
+ Search for entities by name or content.
123
+
124
+ Args:
125
+ query: Search query string
126
+ entity_type: Filter by entity type (companies, people, etc.)
127
+ limit: Maximum number of results
128
+
129
+ Returns:
130
+ List of matching entities with metadata
131
+ """
132
+ if not self.is_available():
133
+ return []
134
+
135
+ query_lower = query.lower()
136
+ results = []
137
+
138
+ for node in self.graph.nodes():
139
+ node_data = self.graph.nodes[node]
140
+ node_name = node_data.get('name', '').lower()
141
+ node_type = node_data.get('type', '')
142
+
143
+ # Filter by type if specified
144
+ if entity_type and node_type != entity_type:
145
+ continue
146
+
147
+ # Check if query matches name or context
148
+ if query_lower in node_name:
149
+ score = 1.0 if query_lower == node_name else 0.8
150
+
151
+ results.append({
152
+ 'node_id': node,
153
+ 'name': node_data.get('name', ''),
154
+ 'type': node_type,
155
+ 'score': score,
156
+ 'sources': node_data.get('sources', ''),
157
+ 'document_type': node_data.get('document_type', 'unknown'),
158
+ 'context_samples': node_data.get('context_samples', [])[:2] # Limit context
159
+ })
160
+
161
+ # Sort by score and limit results
162
+ results.sort(key=lambda x: x['score'], reverse=True)
163
+ return results[:limit]
164
+
165
+ def get_entity_relationships(self, entity_name: str) -> Dict[str, List[Dict[str, Any]]]:
166
+ """
167
+ Get all relationships for a specific entity.
168
+
169
+ Args:
170
+ entity_name: Name of the entity to find relationships for
171
+
172
+ Returns:
173
+ Dictionary with 'incoming' and 'outgoing' relationship lists
174
+ """
175
+ if not self.is_available():
176
+ return {'incoming': [], 'outgoing': []}
177
+
178
+ # Find matching nodes
179
+ matching_nodes = []
180
+ for node in self.graph.nodes():
181
+ if entity_name.lower() in self.graph.nodes[node].get('name', '').lower():
182
+ matching_nodes.append(node)
183
+
184
+ if not matching_nodes:
185
+ return {'incoming': [], 'outgoing': []}
186
+
187
+ relationships = {'incoming': [], 'outgoing': []}
188
+
189
+ for node in matching_nodes:
190
+ # Outgoing relationships
191
+ for _, target, edge_data in self.graph.out_edges(node, data=True):
192
+ relationships['outgoing'].append({
193
+ 'target': self.graph.nodes[target].get('name', target),
194
+ 'target_type': self.graph.nodes[target].get('type', 'unknown'),
195
+ 'relationship': edge_data.get('relationship', 'unknown'),
196
+ 'source_document': edge_data.get('source_document', ''),
197
+ 'context': edge_data.get('context', '')[:200], # Truncate context
198
+ 'confidence': edge_data.get('confidence', 0.0)
199
+ })
200
+
201
+ # Incoming relationships
202
+ for source, _, edge_data in self.graph.in_edges(node, data=True):
203
+ relationships['incoming'].append({
204
+ 'source': self.graph.nodes[source].get('name', source),
205
+ 'source_type': self.graph.nodes[source].get('type', 'unknown'),
206
+ 'relationship': edge_data.get('relationship', 'unknown'),
207
+ 'source_document': edge_data.get('source_document', ''),
208
+ 'context': edge_data.get('context', '')[:200], # Truncate context
209
+ 'confidence': edge_data.get('confidence', 0.0)
210
+ })
211
+
212
+ return relationships
213
+
214
+ def find_paths(self, source_entity: str, target_entity: str, max_length: int = 3) -> List[List[str]]:
215
+ """
216
+ Find paths between two entities in the knowledge graph.
217
+
218
+ Args:
219
+ source_entity: Starting entity name
220
+ target_entity: Target entity name
221
+ max_length: Maximum path length to search
222
+
223
+ Returns:
224
+ List of paths (each path is a list of entity names)
225
+ """
226
+ if not self.is_available():
227
+ return []
228
+
229
+ # Find matching nodes
230
+ source_nodes = [n for n in self.graph.nodes()
231
+ if source_entity.lower() in self.graph.nodes[n].get('name', '').lower()]
232
+ target_nodes = [n for n in self.graph.nodes()
233
+ if target_entity.lower() in self.graph.nodes[n].get('name', '').lower()]
234
+
235
+ if not source_nodes or not target_nodes:
236
+ return []
237
+
238
+ paths = []
239
+ for source_node in source_nodes:
240
+ for target_node in target_nodes:
241
+ if source_node == target_node:
242
+ continue
243
+
244
+ try:
245
+ # Find all simple paths up to max_length
246
+ simple_paths = list(nx.all_simple_paths(
247
+ self.graph, source_node, target_node, cutoff=max_length
248
+ ))
249
+
250
+ # Convert node IDs to entity names
251
+ for path in simple_paths[:5]: # Limit to 5 paths per pair
252
+ entity_path = [self.graph.nodes[node].get('name', node) for node in path]
253
+ paths.append(entity_path)
254
+
255
+ except nx.NetworkXNoPath:
256
+ continue
257
+
258
+ return paths[:10] # Return max 10 paths total
259
+
260
+ def get_central_entities(self, limit: int = 10) -> List[Dict[str, Any]]:
261
+ """
262
+ Get the most central/important entities in the graph.
263
+
264
+ Args:
265
+ limit: Maximum number of entities to return
266
+
267
+ Returns:
268
+ List of entities with centrality scores
269
+ """
270
+ if not self.is_available() or len(self.graph.nodes()) < 2:
271
+ return []
272
+
273
+ try:
274
+ # Calculate degree centrality
275
+ centrality = nx.degree_centrality(self.graph)
276
+
277
+ # Get top central entities
278
+ top_entities = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:limit]
279
+
280
+ results = []
281
+ for node, score in top_entities:
282
+ node_data = self.graph.nodes[node]
283
+ results.append({
284
+ 'name': node_data.get('name', ''),
285
+ 'type': node_data.get('type', 'unknown'),
286
+ 'centrality_score': round(score, 3),
287
+ 'num_connections': len(list(self.graph.neighbors(node))),
288
+ 'sources': node_data.get('sources', '')
289
+ })
290
+
291
+ return results
292
+
293
+ except Exception as e:
294
+ logger.error(f"Error calculating centrality: {e}")
295
+ return []
296
+
297
+ def get_entity_clusters(self) -> List[List[str]]:
298
+ """
299
+ Find clusters of related entities using community detection.
300
+
301
+ Returns:
302
+ List of clusters (each cluster is a list of entity names)
303
+ """
304
+ if not self.is_available() or len(self.graph.nodes()) < 3:
305
+ return []
306
+
307
+ try:
308
+ # Convert to undirected graph for community detection
309
+ undirected = self.graph.to_undirected()
310
+
311
+ # Use simple connected components as clusters
312
+ components = list(nx.connected_components(undirected))
313
+
314
+ clusters = []
315
+ for component in components:
316
+ if len(component) > 1: # Only include clusters with multiple entities
317
+ cluster_names = [self.graph.nodes[node].get('name', node) for node in component]
318
+ clusters.append(cluster_names)
319
+
320
+ # Sort clusters by size
321
+ clusters.sort(key=len, reverse=True)
322
+ return clusters[:5] # Return top 5 clusters
323
+
324
+ except Exception as e:
325
+ logger.error(f"Error finding clusters: {e}")
326
+ return []
327
+
328
+ def export_graph_data(self) -> Dict[str, Any]:
329
+ """
330
+ Export graph data for visualization or further analysis.
331
+
332
+ Returns:
333
+ Dictionary with nodes and edges data suitable for visualization
334
+ """
335
+ if not self.is_available():
336
+ return {'nodes': [], 'edges': []}
337
+
338
+ # Export nodes
339
+ nodes = []
340
+ for node in self.graph.nodes():
341
+ node_data = self.graph.nodes[node]
342
+ nodes.append({
343
+ 'id': node,
344
+ 'name': node_data.get('name', ''),
345
+ 'type': node_data.get('type', 'unknown'),
346
+ 'sources': node_data.get('sources', ''),
347
+ 'document_type': node_data.get('document_type', 'unknown')
348
+ })
349
+
350
+ # Export edges
351
+ edges = []
352
+ for source, target, edge_data in self.graph.edges(data=True):
353
+ edges.append({
354
+ 'source': source,
355
+ 'target': target,
356
+ 'relationship': edge_data.get('relationship', 'unknown'),
357
+ 'source_document': edge_data.get('source_document', ''),
358
+ 'confidence': edge_data.get('confidence', 0.0)
359
+ })
360
+
361
+ return {
362
+ 'nodes': nodes,
363
+ 'edges': edges,
364
+ 'metadata': self.metadata or {}
365
+ }
366
+
367
    def _load_document_processor(self):
        """Lazily create the document processor used for semantic search.

        Sets self.document_processor on success; leaves/resets it to None
        when the FAISS vector store is missing or creation fails, so callers
        can detect the situation and fall back to plain text search.
        """
        if self.document_processor is None:
            try:
                # Local import so the processor stack is only pulled in when
                # semantic search is actually used.
                from app.core.utils import create_document_processor
                self.document_processor = create_document_processor(store_name=self.store_name)
                if not self.document_processor.vector_store:
                    # A processor without a vector store is useless here.
                    logger.warning(f"No FAISS vector store available for {self.store_name}")
                    self.document_processor = None
            except Exception as e:
                logger.error(f"Failed to load document processor for {self.store_name}: {e}")
                self.document_processor = None
379
+
380
    def semantic_search_entities(self, query: str, limit: int = 10, similarity_threshold: float = 0.3) -> List[Dict[str, Any]]:
        """
        Perform semantic search on entities using FAISS embeddings.

        This method finds entities whose source contexts are semantically similar
        to the query, providing more intelligent search than simple text matching.
        Falls back to plain text search (search_entities) when no vector store is
        available or the FAISS query fails.

        Args:
            query: Natural language query
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score to include

        Returns:
            List of entities with similarity scores and context
        """
        if not self.is_available():
            return []

        # Load document processor if not already loaded
        self._load_document_processor()
        if not self.document_processor or not self.document_processor.vector_store:
            logger.warning("Semantic search not available - falling back to text search")
            return self.search_entities(query, limit=limit)

        try:
            # Perform semantic search on FAISS index
            relevant_docs = self.document_processor.vector_store.similarity_search_with_score(
                query, k=min(50, limit * 5)  # Get more candidates for filtering
            )

            # Map document chunks back to entities
            entity_matches = []
            seen_entities = set()

            for doc, score in relevant_docs:
                # NOTE(review): FAISS similarity_search_with_score typically returns
                # a DISTANCE (lower = more similar). This filter treats `score` as a
                # similarity (dropping low values), while `1.0 - score` below treats
                # it as a distance. The two uses look inconsistent - confirm the
                # metric of the underlying vector store.
                if score < similarity_threshold:
                    continue

                # Find entities that originated from this document chunk
                chunk_id = doc.metadata.get('chunk_id', '')
                doc_source = doc.metadata.get('source', '')

                # Search for entities that came from this chunk/document
                for node in self.graph.nodes():
                    node_data = self.graph.nodes[node]
                    entity_sources = node_data.get('sources', '')

                    # Check if entity came from this document
                    if (doc_source and doc_source in entity_sources) or (chunk_id and chunk_id in str(node_data.get('context_samples', []))):
                        # Dedupe on (name, type) so the same entity reached via
                        # several chunks is reported once.
                        entity_key = f"{node_data.get('name', '')}_{node_data.get('type', '')}"

                        if entity_key not in seen_entities:
                            seen_entities.add(entity_key)
                            entity_matches.append({
                                'node_id': node,
                                'name': node_data.get('name', ''),
                                'type': node_data.get('type', 'unknown'),
                                'similarity_score': 1.0 - score,  # Convert distance to similarity
                                'sources': entity_sources,
                                'document_type': node_data.get('document_type', 'unknown'),
                                'context_samples': node_data.get('context_samples', [])[:2],
                                'matching_context': doc.page_content[:300]  # Show relevant context
                            })

                        if len(entity_matches) >= limit:
                            break

                if len(entity_matches) >= limit:
                    break

            # Sort by similarity score
            entity_matches.sort(key=lambda x: x['similarity_score'], reverse=True)
            return entity_matches[:limit]

        except Exception as e:
            logger.error(f"Semantic search failed: {e}")
            # Fallback to regular text search
            return self.search_entities(query, limit=limit)
458
+
459
    def find_related_entities_by_context(self, entity_name: str, limit: int = 5) -> List[Dict[str, Any]]:
        """
        Find entities related to the given entity based on semantic similarity of their contexts.

        Uses the first stored context sample of the (first) matching reference
        entity as a FAISS query and maps the most similar chunks back to other
        graph entities. Returns an empty list when the entity, its context
        samples, or the vector store are unavailable.

        Args:
            entity_name: Name of the reference entity
            limit: Maximum number of related entities to return

        Returns:
            List of related entities with similarity scores
        """
        if not self.is_available():
            return []

        # Find the reference entity (case-insensitive substring match)
        reference_entities = [n for n in self.graph.nodes()
                              if entity_name.lower() in self.graph.nodes[n].get('name', '').lower()]

        if not reference_entities:
            return []

        # Load document processor
        self._load_document_processor()
        if not self.document_processor or not self.document_processor.vector_store:
            return []

        try:
            # Get context samples from the reference entity; only the first
            # matching node is used as the reference.
            reference_node = reference_entities[0]
            reference_data = self.graph.nodes[reference_node]
            context_samples = reference_data.get('context_samples', [])

            if not context_samples:
                return []

            # Use the first context sample as a query
            query_context = context_samples[0][:500]  # Limit context length

            # Find semantically similar contexts
            similar_docs = self.document_processor.vector_store.similarity_search_with_score(
                query_context, k=20
            )

            # Map back to entities
            related_entities = []
            # Seed with the reference's own name so it never appears in its
            # own related-entity list.
            seen_entities = {reference_data.get('name', '')}

            for doc, score in similar_docs:
                doc_source = doc.metadata.get('source', '')

                # Find entities from this document
                for node in self.graph.nodes():
                    if node == reference_node:
                        continue

                    node_data = self.graph.nodes[node]
                    entity_name_node = node_data.get('name', '')
                    entity_sources = node_data.get('sources', '')

                    if (entity_name_node not in seen_entities and
                        doc_source and doc_source in entity_sources):

                        seen_entities.add(entity_name_node)
                        # NOTE(review): `1.0 - score` assumes `score` is a
                        # distance (lower = closer) - confirm against the
                        # vector store metric.
                        related_entities.append({
                            'name': entity_name_node,
                            'type': node_data.get('type', 'unknown'),
                            'similarity_score': 1.0 - score,
                            'sources': entity_sources,
                            'context_samples': node_data.get('context_samples', [])[:1],
                            'relationship_reason': 'Semantic context similarity'
                        })

                        if len(related_entities) >= limit:
                            break

                if len(related_entities) >= limit:
                    break

            # Sort by similarity
            related_entities.sort(key=lambda x: x['similarity_score'], reverse=True)
            return related_entities[:limit]

        except Exception as e:
            logger.error(f"Context-based entity search failed: {e}")
            return []
544
+
545
    def semantic_path_search(self, query: str, max_paths: int = 5) -> List[Dict[str, Any]]:
        """
        Find paths in the graph that are semantically relevant to a query.

        First retrieves query-relevant entities via semantic_search_entities,
        then enumerates simple paths between the top entity pairs and scores
        each path by the mean similarity of its two endpoints.

        Args:
            query: Natural language description of what to find
            max_paths: Maximum number of paths to return

        Returns:
            List of paths with relevance scores, most relevant first
        """
        if not self.is_available():
            return []

        # First, find entities semantically related to the query
        relevant_entities = self.semantic_search_entities(query, limit=10)

        # A path needs at least two distinct endpoints.
        if len(relevant_entities) < 2:
            return []

        # Find interesting paths between the most relevant entities
        paths_found = []

        for i, entity1 in enumerate(relevant_entities[:5]):  # Limit to top 5 for performance
            for entity2 in relevant_entities[i+1:]:
                try:
                    # Find paths between these entities
                    paths = self.find_paths(entity1['name'], entity2['name'], max_length=3)

                    for path in paths[:2]:  # Limit paths per pair
                        # Calculate path relevance based on entity similarity scores
                        path_score = (entity1['similarity_score'] + entity2['similarity_score']) / 2

                        paths_found.append({
                            'path': path,
                            'relevance_score': path_score,
                            'start_entity': entity1['name'],
                            'end_entity': entity2['name'],
                            'query_relevance': f"Related to: {query}",
                            'path_length': len(path) - 1  # edges, not nodes
                        })

                        if len(paths_found) >= max_paths:
                            break

                except Exception as e:
                    # Path failures between one pair should not abort the search.
                    logger.debug(f"Path finding failed between {entity1['name']} and {entity2['name']}: {e}")
                    continue

                if len(paths_found) >= max_paths:
                    break

            if len(paths_found) >= max_paths:
                break

        # Sort by relevance score
        paths_found.sort(key=lambda x: x['relevance_score'], reverse=True)
        return paths_found[:max_paths]
603
+
604
@st.cache_data(ttl=3600)
def get_available_knowledge_graphs() -> List[str]:
    """
    Get list of available knowledge graphs.

    Scans the knowledge_graphs directory for `*_knowledge_graph.pkl` files
    and derives each store name from the filename.

    Returns:
        Sorted list of store names that have knowledge graphs available
    """
    try:
        graphs_dir = get_config().paths['faiss_dir'] / 'knowledge_graphs'
        if not graphs_dir.exists():
            return []

        # Store name = filename stem minus the fixed suffix.
        return sorted(
            candidate.stem.replace('_knowledge_graph', '')
            for candidate in graphs_dir.glob("*_knowledge_graph.pkl")
        )

    except Exception as e:
        logger.error(f"Error getting available knowledge graphs: {e}")
        return []
628
+
629
def create_knowledge_graph_manager(store_name: str) -> KnowledgeGraphManager:
    """
    Factory function to create a knowledge graph manager.

    The returned manager is not loaded yet; call load_graph() on it before
    querying.

    Args:
        store_name: Company store name

    Returns:
        Configured KnowledgeGraphManager instance
    """
    return KnowledgeGraphManager(store_name)
app/core/logging.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Logging Configuration Module
4
+
5
+ Provides consistent logging setup for the application.
6
+ This replaces the old src-based logging with a cleaner, app-specific solution.
7
+ """
8
+
9
+ import logging
10
+ import sys
11
+ from pathlib import Path
12
+ from logging.handlers import RotatingFileHandler
13
+
14
+
15
+ def configure_langchain_logging(log_level: str = "WARNING") -> None:
16
+ """
17
+ Configure LangChain library logging levels to reduce verbosity.
18
+
19
+ Args:
20
+ log_level: Logging level for LangChain modules (default: WARNING)
21
+ """
22
+ langchain_modules = [
23
+ "langchain",
24
+ "langchain_core",
25
+ "langchain_community",
26
+ "langchain_huggingface"
27
+ ]
28
+
29
+ level = getattr(logging, log_level.upper())
30
+ for module in langchain_modules:
31
+ logging.getLogger(module).setLevel(level)
32
+
33
+
34
def setup_logging(
    name: str = "dd_poc",
    log_level: str = "INFO",
    log_file: "str | None" = None
) -> logging.Logger:
    """
    Set up standard Python logging with console and rotating file handlers.

    Idempotent: if the named logger already has handlers, it is returned
    unchanged (its level and handlers are not reconfigured).

    Args:
        name: Logger name
        log_level: Logging level name (e.g. "INFO", "DEBUG")
        log_file: Optional log file path; defaults to .logs/dd_poc_<cwd>.log

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name)

    # Avoid duplicate handlers if this is called repeatedly (e.g. on reruns).
    if logger.handlers:
        return logger

    logger.setLevel(getattr(logging, log_level.upper()))

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    logger.addHandler(console_handler)

    # File logging is best-effort: always attempted (the previous
    # `if log_file or True:` guard was a tautology and has been removed),
    # but skipped silently where the filesystem is unavailable
    # (e.g. Streamlit Cloud).
    try:
        if not log_file:
            log_dir = Path(".logs")
            log_dir.mkdir(exist_ok=True)
            log_file = log_dir / f"dd_poc_{Path.cwd().name}.log"

        # RotatingFileHandler keeps log size bounded.
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB per file
            backupCount=5
        )
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
        ))
        logger.addHandler(file_handler)
    except Exception:
        # File logging not available; console logging still works.
        pass

    return logger
91
+
92
+
93
+ # Global logger instance
94
+ logger = setup_logging()
app/core/model_cache.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Cache Manager
4
+
5
+ Provides global caching for HuggingFace models to prevent re-downloads
6
+ across multiple instances and sessions.
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional
11
+ from pathlib import Path
12
+ from langchain_huggingface import HuggingFaceEmbeddings
13
+ from sentence_transformers import CrossEncoder
14
+
15
+ from app.core.logging import logger
16
+
17
# Optional accelerate import: GPU placement optimization is used when the
# `accelerate` package is installed, and silently skipped otherwise.
try:
    from accelerate import Accelerator
    ACCELERATE_AVAILABLE = True
except ImportError:
    ACCELERATE_AVAILABLE = False
    Accelerator = None

# Global model cache: maps model name -> loaded model instance, shared by
# every caller in the process so each model is built at most once.
_EMBEDDINGS_CACHE = {}
_CROSS_ENCODER_CACHE = {}

# Local models directory (<repo root>/models), checked before downloading.
_MODELS_DIR = Path(__file__).parent.parent.parent / "models"
31
+
32
+ def _get_local_model_path(model_name: str) -> Optional[Path]:
33
+ """
34
+ Get local path for a model if it exists.
35
+
36
+ Args:
37
+ model_name: HuggingFace model name
38
+
39
+ Returns:
40
+ Path to local model directory or None if not found
41
+ """
42
+ if "/" in model_name:
43
+ # Handle different model name formats
44
+ if model_name.startswith("sentence-transformers/"):
45
+ # For sentence transformers: sentence-transformers/all-mpnet-base-v2
46
+ model_short_name = model_name.split("/")[-1]
47
+ local_path = _MODELS_DIR / "sentence_transformers" / model_short_name
48
+ elif model_name.startswith("cross-encoder/"):
49
+ # For cross encoders: cross-encoder/ms-marco-MiniLM-L-6-v2
50
+ model_short_name = model_name.split("/")[-1]
51
+ local_path = _MODELS_DIR / "cross_encoder" / model_short_name
52
+ else:
53
+ # Fallback for other models
54
+ model_short_name = model_name.split("/")[-1]
55
+ local_path = _MODELS_DIR / model_short_name
56
+
57
+ if local_path.exists():
58
+ return local_path
59
+
60
+ return None
61
+
62
def get_cached_embeddings(model_name: str = "sentence-transformers/all-mpnet-base-v2") -> HuggingFaceEmbeddings:
    """
    Return a process-wide cached HuggingFace embeddings model.

    The model is instantiated once per model name and reused by every caller.
    A locally vendored copy under models/ is preferred; otherwise the model
    is downloaded from HuggingFace. When `accelerate` is installed, device
    placement is delegated to it.
    """
    cached = _EMBEDDINGS_CACHE.get(model_name)
    if cached is not None:
        logger.debug(f"Using cached embeddings model: {model_name}")
        return cached

    # First use of this model name: build it, preferring the local copy.
    local_path = _get_local_model_path(model_name)
    if local_path:
        logger.info(f"Using local embeddings model: {local_path}")
        embeddings = HuggingFaceEmbeddings(model_name=str(local_path))
    else:
        logger.info(f"Downloading embeddings model: {model_name}")
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Optimize device placement with accelerate if available.
    if ACCELERATE_AVAILABLE:
        try:
            accelerator = Accelerator()
            # Accelerate will automatically handle device placement.
            logger.info(f"Embeddings model optimized for device: {accelerator.device}")
        except Exception as e:
            logger.warning(f"Failed to optimize embeddings with accelerate: {e}")

    _EMBEDDINGS_CACHE[model_name] = embeddings
    return embeddings
94
+
95
def get_cached_cross_encoder(model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2') -> CrossEncoder:
    """
    Return a process-wide cached cross-encoder model.

    The model is instantiated once per model name and reused by every caller,
    preferring a locally vendored copy under models/ when one exists.
    """
    if model_name in _CROSS_ENCODER_CACHE:
        logger.debug(f"Using cached cross-encoder model: {model_name}")
        return _CROSS_ENCODER_CACHE[model_name]

    # First use of this model name: build it, preferring the local copy.
    local_path = _get_local_model_path(model_name)
    if local_path:
        logger.info(f"Using local cross-encoder model: {local_path}")
        model = CrossEncoder(str(local_path))
    else:
        logger.info(f"Downloading cross-encoder model: {model_name}")
        model = CrossEncoder(model_name)

    _CROSS_ENCODER_CACHE[model_name] = model
    return model
115
+
116
def clear_model_cache():
    """
    Clear all cached models.

    Drops every entry from both module-level caches so the next call to the
    get_cached_* helpers rebuilds the models. Useful for memory management
    or testing.
    """
    _EMBEDDINGS_CACHE.clear()
    _CROSS_ENCODER_CACHE.clear()
    logger.info("Model cache cleared")
app/core/parsers.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM-based parsing functions for due diligence documents.
4
+
5
+ This module provides modern structured output parsing using Pydantic models
6
+ to ensure reliable, type-safe parsing of LLM responses.
7
+ """
8
+
9
+ from typing import Dict, List
10
+ from app.core.logging import logger
11
+
12
+
13
def parse_checklist(checklist_text: str, llm) -> Dict:
    """
    Parse markdown checklist using Pydantic structured output.

    This approach uses LangChain's PydanticOutputParser to ensure the LLM
    returns properly structured data that matches our expected format.

    Args:
        checklist_text: The raw checklist text to parse
        llm: LLM instance to use for parsing

    Returns:
        Dictionary with categories and their items

    Raises:
        RuntimeError: If LLM is not available or parsing fails
        ValueError: If llm parameter is not provided
    """
    if llm is None:
        raise ValueError("LLM parameter is required")

    try:
        # Imported lazily so this module stays importable without langchain.
        from langchain_core.output_parsers import PydanticOutputParser
        from app.ai.processing_pipeline import StructuredChecklist
        from app.ai.prompts import get_checklist_parsing_prompt

        # Set up structured output parser
        parser = PydanticOutputParser(pydantic_object=StructuredChecklist)

        # Use centralized prompt from prompts.py (avoid duplication)
        prompt = get_checklist_parsing_prompt()

        # Format the prompt with the checklist text and format instructions
        formatted_prompt = prompt.format_messages(
            checklist_text=checklist_text,
            format_instructions=parser.get_format_instructions()
        )

        # Get LLM response
        logger.info(f"Sending checklist to LLM for parsing (length: {len(checklist_text)} chars)")
        llm_response = llm.invoke(formatted_prompt)
        logger.debug(f"LLM response length: {len(llm_response.content)} chars")

        # Parse the response using the Pydantic parser
        result = parser.parse(llm_response.content)

        # Convert Pydantic model to the plain-dict format callers expect
        categories_dict = {
            key: {
                'name': category.name,
                'items': [
                    {'text': item.text, 'original': item.original}
                    for item in category.items
                ],
            }
            for key, category in result.categories.items()
        }

        logger.info(f"Successfully parsed {len(categories_dict)} categories: {list(categories_dict.keys())}")
        return categories_dict

    except Exception as e:
        # Chain the original exception so the root cause is not lost
        # (the previous version re-raised without `from e`).
        raise RuntimeError(f"Structured parsing failed: {str(e)}") from e
78
+
79
+
80
def parse_questions(questions_text: str, llm) -> List[Dict]:
    """
    Parse markdown questions using Pydantic structured output.

    Args:
        questions_text: The raw questions text to parse
        llm: LLM instance to use for parsing

    Returns:
        List of dictionaries with question data (category, question, id)

    Raises:
        RuntimeError: If LLM is not available or parsing fails
        ValueError: If llm parameter is not provided
    """
    if llm is None:
        raise ValueError("LLM parameter is required")

    try:
        # Imported lazily so this module stays importable without langchain;
        # consolidated here (the old version scattered them, importing
        # HumanMessagePromptTemplate mid-function and HumanMessage unused).
        from langchain_core.output_parsers import PydanticOutputParser
        from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
        from langchain_core.messages import SystemMessage
        from app.ai.processing_pipeline import StructuredQuestions

        # Set up structured output parser
        parser = PydanticOutputParser(pydantic_object=StructuredQuestions)

        # Create prompt with format instructions
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="""
You are a document parser. Parse the due diligence questions document into the EXACT JSON format specified.

CRITICAL:
- Return ONLY valid JSON, no additional text or explanations
- Extract categories (like "### A. Category Name")
- Extract numbered questions within each category
- Clean up markdown formatting but preserve core text
- Follow the exact format specified in the format instructions

The output must be valid JSON that can be parsed directly.
"""),
            HumanMessagePromptTemplate.from_template("""Parse these questions into the exact JSON format:

{questions_text}

Required JSON schema:
{format_instructions}

Return only the JSON:""")
        ])

        # Format the prompt with the questions text and format instructions
        formatted_prompt = prompt.format_messages(
            questions_text=questions_text,
            format_instructions=parser.get_format_instructions()
        )

        # Get LLM response
        llm_response = llm.invoke(formatted_prompt)

        # Parse the response using the Pydantic parser
        result = parser.parse(llm_response.content)

        # Convert Pydantic model to the expected list-of-dicts format
        return [
            {
                'category': question.category,
                'question': question.question,
                'id': question.id
            }
            for question in result.questions
        ]

    except Exception as e:
        # Chain the original exception so the root cause is not lost
        # (the previous version re-raised without `from e`).
        raise RuntimeError(f"Structured parsing failed: {str(e)}") from e
app/core/performance.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Performance Optimization Module
4
+
5
+ This module provides performance optimizations using prebuilt libraries:
6
+ - diskcache: Smart caching system
7
+ - joblib: Function result caching
8
+ - httpx: Async HTTP client
9
+ - backoff: Retry logic with exponential backoff
10
+ - psutil: System resource monitoring
11
+ """
12
+
13
+ import asyncio
14
+ import hashlib
15
+ import logging
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Dict, List, Any, Optional, Callable, TypeVar, Union
19
+ from functools import wraps
20
+
21
+ import diskcache
22
+ import joblib
23
+ import httpx
24
+ import backoff
25
+ import psutil
26
+ from tqdm import tqdm
27
+
28
+ # Optional imports for GPU/CPU optimization
29
+ try:
30
+ import accelerate
31
+ ACCELERATE_AVAILABLE = True
32
+ except ImportError:
33
+ ACCELERATE_AVAILABLE = False
34
+
35
+ try:
36
+ import memory_profiler
37
+ MEMORY_PROFILER_AVAILABLE = True
38
+ except ImportError:
39
+ MEMORY_PROFILER_AVAILABLE = False
40
+
41
+ from app.core.config import get_config
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # Type hints
46
+ T = TypeVar('T')
47
+
48
class PerformanceManager:
    """Central manager for performance optimizations.

    Owns the process-wide performance infrastructure:

    - ``doc_cache``: disk-backed LRU cache (diskcache) for raw document text,
      capped at 500MB.
    - ``embedding_cache``: disk-backed LRU cache for embedding vectors,
      capped at 2GB.
    - ``memory``: joblib on-disk memoization cache for expensive computations.
    - ``http_client``: shared async httpx client with connection pooling and
      retry (via ``backoff``) for AI API calls.

    All caches live under the configured FAISS directory. Use the module-level
    ``get_performance_manager()`` accessor rather than constructing directly,
    so the caches and HTTP client are shared.
    """

    def __init__(self) -> None:
        # Configuration supplies the on-disk locations for every cache.
        self.config = get_config()
        self._setup_caches()
        self._setup_clients()

    def _setup_caches(self) -> None:
        """Initialize the disk-backed caching systems under the FAISS dir."""
        faiss_dir = self.config.paths['faiss_dir']
        faiss_dir.mkdir(parents=True, exist_ok=True)

        # Document content cache (raw extracted text), 500MB LRU.
        self.doc_cache = diskcache.Cache(
            str(faiss_dir / '.doc_cache'),
            size_limit=500 * 1024 * 1024,  # 500MB
            eviction_policy='least-recently-used'
        )

        # Embedding cache (lists of float vectors), 2GB LRU.
        self.embedding_cache = diskcache.Cache(
            str(faiss_dir / '.embedding_cache'),
            size_limit=2 * 1024 * 1024 * 1024,  # 2GB
            eviction_policy='least-recently-used'
        )

        # Joblib memory cache for expensive computations (compressed pickles).
        self.memory = joblib.Memory(
            location=str(faiss_dir / '.joblib_cache'),
            verbose=0,
            compress=True
        )

    def _setup_clients(self) -> None:
        """Initialize the shared async HTTP client used for AI API calls."""
        # 60s total timeout with a 10s connect budget; small keep-alive pool.
        self.http_client = httpx.AsyncClient(
            timeout=httpx.Timeout(60.0, connect=10.0),
            limits=httpx.Limits(max_connections=10, max_keepalive_connections=5)
        )

    @staticmethod
    def get_file_hash(file_path: Path) -> str:
        """Calculate the SHA-256 hash of a file's content, streamed in 4KB chunks."""
        hash_sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()

    def cache_document_content(self, file_path: Path, content: str) -> None:
        """Cache extracted document text, keyed by the file's content hash.

        Keying on the hash (not the path) means a re-uploaded identical file
        hits the cache, while any content change produces a fresh key.
        """
        file_hash = self.get_file_hash(file_path)
        cache_key = f"doc_content:{file_hash}"
        self.doc_cache.set(cache_key, content, expire=86400 * 30)  # 30 days

    def get_cached_document_content(self, file_path: Path) -> Optional[str]:
        """Return cached document text for this file's content, or None on a miss.

        Note: this re-reads and re-hashes the file on every lookup.
        """
        file_hash = self.get_file_hash(file_path)
        cache_key = f"doc_content:{file_hash}"
        return self.doc_cache.get(cache_key)

    def cache_embeddings(self, text_hash: str, embeddings: List[List[float]]) -> None:
        """Cache embedding vectors under a caller-supplied content hash (30-day TTL)."""
        cache_key = f"embeddings:{text_hash}"
        self.embedding_cache.set(cache_key, embeddings, expire=86400 * 30)

    def get_cached_embeddings(self, text_hash: str) -> Optional[List[List[float]]]:
        """Return cached embeddings for this content hash, or None on a miss."""
        cache_key = f"embeddings:{text_hash}"
        return self.embedding_cache.get(cache_key)

    @backoff.on_exception(
        backoff.expo,
        (httpx.RequestError, httpx.TimeoutException),
        max_tries=3,
        jitter=backoff.random_jitter
    )
    async def make_api_request(self, url: str, **kwargs) -> httpx.Response:
        """Make an API request with automatic exponential-backoff retry (3 tries).

        NOTE(review): httpx's ``AsyncClient.request`` also requires a
        ``method`` argument; callers presumably supply it via ``**kwargs``
        (e.g. ``method="POST"``) — confirm at the call sites.
        """
        return await self.http_client.request(url=url, **kwargs)

    def monitor_memory_usage(self) -> Dict[str, float]:
        """Return current process memory stats (MB) and, if available, GPU memory (GB).

        Keys: ``rss``, ``vms``, ``percent``; plus ``gpu_total``,
        ``gpu_allocated``, ``gpu_reserved`` when torch+CUDA are usable.
        """
        process = psutil.Process()
        memory_info = process.memory_info()

        result = {
            'rss': memory_info.rss / 1024 / 1024,  # MB
            'vms': memory_info.vms / 1024 / 1024,  # MB
            'percent': process.memory_percent()
        }

        # Add GPU memory info if available (best-effort; failures are debug-logged).
        if ACCELERATE_AVAILABLE:
            try:
                import torch
                if torch.cuda.is_available():
                    gpu_memory = torch.cuda.get_device_properties(0)
                    result.update({
                        'gpu_total': gpu_memory.total_memory / 1024 / 1024 / 1024,  # GB
                        'gpu_allocated': torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024,  # GB
                        'gpu_reserved': torch.cuda.memory_reserved(0) / 1024 / 1024 / 1024,  # GB
                    })
            except Exception as e:
                logger.debug(f"Could not get GPU memory info: {e}")

        return result

    def should_gc_collect(self, memory_usage: Dict[str, float]) -> bool:
        """Return True when memory pressure warrants a manual gc.collect().

        Triggers above 80% of process memory share or 2000MB RSS.
        """
        return memory_usage['percent'] > 80.0 or memory_usage['rss'] > 2000  # 80% or 2GB

    def cleanup_cache(self) -> Dict[str, int]:
        """Evict expired entries from both disk caches; return removal counts."""
        doc_cleaned = self.doc_cache.expire()
        embedding_cleaned = self.embedding_cache.expire()

        return {
            'doc_cache_cleaned': doc_cleaned,
            'embedding_cache_cleaned': embedding_cleaned
        }

    async def close(self) -> None:
        """Close the HTTP client and both disk caches (call at shutdown)."""
        await self.http_client.aclose()
        self.doc_cache.close()
        self.embedding_cache.close()

    def optimize_batch_size(self, available_memory: float, item_size_estimate: float = 0.1) -> int:
        """Pick a batch size from available memory (MB) and per-item size estimate (MB).

        Reserves 20% headroom and clamps the result to [1, 1000].
        """
        # Reserve 20% of memory for overhead
        usable_memory = available_memory * 0.8

        # Estimate optimal batch size
        optimal_batch = int(usable_memory / item_size_estimate)

        # Clamp to reasonable bounds
        return max(1, min(optimal_batch, 1000))

    def get_optimal_device(self) -> str:
        """Return 'cuda' when torch+CUDA are usable, else 'cpu'."""
        if ACCELERATE_AVAILABLE:
            try:
                import torch
                if torch.cuda.is_available():
                    return 'cuda'
            except:  # NOTE(review): bare except silently covers torch import/runtime errors
                pass
        return 'cpu'

    def setup_accelerate(self):
        """Initialize and store an ``accelerate.Accelerator``; returns it or None.

        Returns None when accelerate is unavailable or initialization fails
        (failure is logged as a warning).
        """
        if ACCELERATE_AVAILABLE:
            try:
                from accelerate import Accelerator
                self.accelerator = Accelerator()
                logger.info(f"Accelerate initialized with device: {self.accelerator.device}")
                return self.accelerator
            except Exception as e:
                logger.warning(f"Failed to initialize accelerate: {e}")
        return None
211
+
212
+
213
+ # Global performance manager instance
214
# Global performance manager instance
_perf_manager = None


def get_performance_manager() -> PerformanceManager:
    """Return the process-wide PerformanceManager, creating it on first use."""
    global _perf_manager
    if _perf_manager is not None:
        return _perf_manager
    _perf_manager = PerformanceManager()
    return _perf_manager
222
+
223
+
224
+ # Decorators for easy optimization
225
# Decorators for easy optimization
def cached_by_content(func: Callable[..., T]) -> Callable[..., T]:
    """Cache function results in the document disk cache, keyed by argument content.

    The cache key is derived from every str/Path positional argument *and*
    every str/Path keyword argument. (The original skipped the first
    positional argument unconditionally — assuming a bound method — and
    ignored keyword arguments entirely, so calls differing only in kwargs, or
    a plain function's first argument, could collide on one cache entry.
    Non-string arguments such as ``self`` are still excluded by the
    isinstance filter.)

    Results are stored for 7 days; cache hits are logged at debug level.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Collect every hashable-as-text argument, positional then keyword
        # (kwargs sorted by name so call-site ordering doesn't change the key).
        content_parts = [str(arg) for arg in args if isinstance(arg, (str, Path))]
        content_parts.extend(
            f"{key}={value}"
            for key, value in sorted(kwargs.items())
            if isinstance(value, (str, Path))
        )

        content_hash = hashlib.sha256(
            '|'.join(content_parts).encode()
        ).hexdigest()[:16]

        perf_manager = get_performance_manager()
        cache_key = f"{func.__name__}:{content_hash}"

        # Try cache first
        result = perf_manager.doc_cache.get(cache_key)
        if result is not None:
            logger.debug(f"Cache hit for {func.__name__}")
            return result

        # Compute and cache
        result = func(*args, **kwargs)
        perf_manager.doc_cache.set(cache_key, result, expire=86400 * 7)  # 7 days
        return result

    return wrapper
254
+
255
+
256
def memory_cached(func: Callable[..., T]) -> Callable[..., T]:
    """Memoize *func* on disk through the shared joblib Memory cache."""
    return get_performance_manager().memory.cache(func)
261
+
262
+
263
def monitor_performance(func: Callable[..., T]) -> Callable[..., T]:
    """Decorator: log wall time and RSS delta of each call; GC when memory is high.

    Measurements are taken in a ``finally`` block so they are recorded even
    when the wrapped function raises.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.time()
        manager = get_performance_manager()
        usage_before = manager.monitor_memory_usage()

        try:
            return func(*args, **kwargs)
        finally:
            usage_after = manager.monitor_memory_usage()
            elapsed = time.time() - started

            logger.debug(
                f"{func.__name__}: {elapsed:.2f}s, "
                f"Memory: {usage_before['rss']:.1f}MB -> {usage_after['rss']:.1f}MB"
            )

            # Force a collection when the post-call footprint crosses the threshold.
            if manager.should_gc_collect(usage_after):
                import gc
                gc.collect()
                logger.debug("Garbage collection triggered")

    return wrapper
293
+
294
+
295
+ # Utility functions
296
# Utility functions
def get_text_hash(text: str) -> str:
    """Return a short content fingerprint: the first 16 hex chars of SHA-256(text)."""
    digest = hashlib.sha256(text.encode())
    return digest.hexdigest()[:16]
299
+
300
+
301
def parallel_process(items: List[T], func: Callable[[T], Any],
                     max_workers: int = 4, desc: str = "Processing") -> List[Any]:
    """Apply *func* to every item in parallel threads, preserving input order.

    Fix: the original appended results in *completion* order (via
    ``as_completed``), so the output order was nondeterministic relative to
    the input. Results are now written back by input index, while still
    updating the progress bar as each future completes.

    Args:
        items: Inputs to process.
        func: Callable applied to each item.
        max_workers: Thread pool size.
        desc: Progress-bar label.

    Returns:
        ``[func(item) for item in items]`` — same order as *items*.

    Raises:
        Whatever *func* raises (re-raised from ``future.result()``).
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    results: List[Any] = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the position of the item it is computing.
        futures = {executor.submit(func, item): idx for idx, item in enumerate(items)}

        with tqdm(total=len(items), desc=desc) as pbar:
            for future in as_completed(futures):
                results[futures[future]] = future.result()
                pbar.update(1)

    return results
317
+
318
+
319
def optimize_embedding_batch(texts: List[str], embeddings_model,
                             batch_size: int = 32) -> List[List[float]]:
    """Generate embeddings for *texts* in memory-aware batches.

    Fix: the original fed the process RSS — memory already *in use* — into
    ``optimize_batch_size`` as if it were *available* memory, so batch sizing
    grew as the process got fatter. We now size batches from the system's
    actually-available memory (``psutil.virtual_memory().available``).
    ``import gc`` is also hoisted out of the loop.

    A failed batch is logged and padded with empty lists so the output stays
    aligned 1:1 with *texts* (best-effort semantics, unchanged).

    Args:
        texts: Texts to embed.
        embeddings_model: Object exposing ``embed_documents(list[str])``.
        batch_size: Upper bound on the batch size; may be reduced dynamically.

    Returns:
        One embedding (list of floats, possibly empty on failure) per input text.
    """
    import gc

    perf_manager = get_performance_manager()

    # Available *system* memory in MB (was: process RSS, i.e. used memory).
    available_memory = psutil.virtual_memory().available / 1024 / 1024

    # Dynamically adjust batch size based on memory
    optimal_batch = perf_manager.optimize_batch_size(available_memory, item_size_estimate=0.001)
    batch_size = min(batch_size, optimal_batch)

    logger.info(f"Using optimized batch size: {batch_size} (memory: {available_memory:.1f}MB)")

    all_embeddings: List[List[float]] = []

    # Process in optimized batches
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            # Generate embeddings for this batch
            batch_embeddings = embeddings_model.embed_documents(batch)
            all_embeddings.extend(batch_embeddings)

            # Trigger GC if memory usage is high after this batch
            if perf_manager.should_gc_collect(perf_manager.monitor_memory_usage()):
                gc.collect()
                logger.debug("GC triggered during embedding generation")

        except Exception as e:
            logger.error(f"Failed to process embedding batch {i//batch_size}: {e}")
            # Continue with empty embeddings for this batch
            all_embeddings.extend([[] for _ in batch])

    return all_embeddings
363
+
364
+
365
async def gather_with_concurrency(n: int, *tasks):
    """Await all *tasks* concurrently, allowing at most *n* in flight at once.

    Results come back in the same order as *tasks* (asyncio.gather semantics).
    """
    semaphore = asyncio.Semaphore(n)

    async def bounded(awaitable):
        # Each task must acquire a slot before it may run.
        async with semaphore:
            return await awaitable

    return await asyncio.gather(*map(bounded, tasks))
374
+
375
+
376
+ # Cleanup function for graceful shutdown
377
async def cleanup_performance_resources():
    """Release the global PerformanceManager's resources, if one was created."""
    global _perf_manager
    if _perf_manager is None:
        return
    await _perf_manager.close()
    _perf_manager = None
app/core/ranking.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Ranking utilities for search results reranking.
4
+
5
+ This module provides functions for reranking search results using cross-encoder models
6
+ to improve relevance scoring. Separated from search.py to avoid circular imports.
7
+ """
8
+
9
+ from typing import Dict, List
10
+ from app.core.logging import logger
11
+ from app.core.model_cache import get_cached_cross_encoder
12
+
13
+
14
def rerank_results(query: str, candidates: List[Dict]) -> List[Dict]:
    """Rescore and reorder *candidates* with the cached cross-encoder.

    Each candidate dict gains a ``reranked_score`` (its ``score`` is updated
    to the same value) and the list is sorted in place, best first. On any
    failure the original list and scores are returned unchanged.

    Args:
        query: The search query.
        candidates: Candidate documents, each with at least a ``text`` key.

    Returns:
        The (possibly reordered) candidate list.
    """
    if not candidates:
        return candidates

    try:
        model = get_cached_cross_encoder()

        # Score every (query, document text) pair in one batch.
        pairs = [(query, candidate['text']) for candidate in candidates]
        scores = model.predict(pairs)

        for candidate, raw_score in zip(candidates, scores):
            score = float(raw_score)
            candidate['reranked_score'] = score
            candidate['score'] = score  # keep the main score consistent

        # Higher cross-encoder score means more relevant.
        candidates.sort(key=lambda c: c['reranked_score'], reverse=True)

        logger.info(f"Reranked {len(candidates)} results using cross-encoder")
        return candidates

    except Exception as e:
        logger.warning(f"Cross-encoder reranking failed: {e}. Using original scores.")
        return candidates
app/core/reports.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Report generation functions for due diligence analysis.
4
+ """
5
+
6
+ from typing import Dict
7
+
8
+ from app.core.logging import logger
9
+
10
+
11
def generate_reports_from_cache(checklist_results: Dict, questions_answers: Dict, strategy_text: str, checklist_text: str, questions_text: str) -> Dict:
    """Build the report dict from already-cached results (placeholder).

    The strategic section is truncated to the first 500 characters of the
    strategy text, falling back to a fixed message when none is supplied.
    """
    logger.info("Generating reports from cache")

    strategic = strategy_text[:500] if strategy_text else "No strategy provided"
    return {
        'overview': "Report generated from cached data",
        'strategic': strategic,
        'checklist_summary': f"Processed {len(checklist_results)} categories",
        'questions_summary': f"Processed {len(questions_answers)} questions",
    }
21
+
22
+
23
def generate_reports(checklist_results: Dict, questions_answers: Dict, strategy_text: str, checklist_text: str, questions_text: str) -> Dict:
    """Build the full report dict (placeholder).

    Like :func:`generate_reports_from_cache`, but the strategic section keeps
    up to 1000 characters and the summaries advertise detailed analysis.
    """
    logger.info("Generating comprehensive reports")

    strategic = strategy_text[:1000] if strategy_text else "No strategy provided"
    return {
        'overview': "Comprehensive report generated",
        'strategic': strategic,
        'checklist_summary': f"Processed {len(checklist_results)} categories with detailed analysis",
        'questions_summary': f"Processed {len(questions_answers)} questions with detailed answers",
    }
app/core/search.py ADDED
@@ -0,0 +1,773 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Search and analysis functions for document retrieval and ranking.
4
+ """
5
+
6
+ # Standard library imports
7
+ from typing import Dict, List
8
+ from pathlib import Path
9
+
10
+ # Third-party imports for Unicode normalization
11
+ import unidecode
12
+
13
+ # Third-party imports
14
+ import numpy as np
15
+ from langchain.chains.combine_documents import create_stuff_documents_chain
16
+ from langchain.chains.retrieval import create_retrieval_chain
17
+ from langchain_community.vectorstores import FAISS
18
+ from langchain_core.prompts import PromptTemplate
19
+
20
+ # Local imports
21
+ from app.core.constants import SIMILARITY_THRESHOLD
22
+ from app.core.document_processor import DocumentProcessor
23
+ from app.core.logging import logger
24
+ from app.core.ranking import rerank_results
25
+ from app.core.sparse_index import load_sparse_index_for_store, BM25Index
26
+
27
+
28
def search_and_analyze(queries: List[Dict], vector_store: FAISS, llm=None, threshold: float = SIMILARITY_THRESHOLD, search_type: str = 'items', store_name: str = None, session=None) -> Dict:
    """Dispatch a unified search over checklist items or questions.

    When an LLM is supplied, a retrieval-augmented QA chain is assembled
    (score-threshold retriever + stuff-documents chain); it is only actually
    used by the question path. Checklist items are handled via direct FAISS /
    embedding comparison for accurate scores.
    """
    qa_chain = None
    if llm:
        # Questions use a tighter top-k than checklist items.
        top_k = 5 if search_type == 'questions' else 10
        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": threshold, "k": top_k},
        )

        prompt_template = PromptTemplate(
            input_variables=["context", "input"],
            template="""Use the provided context to answer the question. Be concise and factual.

Context: {context}

Question: {input}

Answer:"""
        )
        # Stuff retrieved documents into the prompt, then wrap with retrieval.
        document_chain = create_stuff_documents_chain(llm, prompt_template)
        qa_chain = create_retrieval_chain(retriever, document_chain)

    if search_type == 'items':
        return _process_checklist_items(queries, vector_store, threshold, store_name, session)
    return _process_questions(queries, vector_store, threshold, qa_chain, llm)
57
+
58
+
59
def _process_checklist_items(checklist: Dict, vector_store: FAISS, threshold: float, store_name: str = None, session=None) -> Dict:
    """Compare checklist items against LLM-generated document type classifications.

    For every checklist item, cosine similarity is computed between its
    preloaded embedding and each document's type-classification embedding;
    matches at or above *threshold* are kept (top 10 per item, best first).

    Fix: the original re-executed ``import numpy as np`` inside the innermost
    per-document loop even though numpy is already imported at module level;
    the redundant import is removed.

    Args:
        checklist: ``{category_letter: {'name': ..., 'items': [...]}}``.
        vector_store: FAISS store (used to locate document type metadata).
        threshold: Minimum cosine similarity for a match.
        store_name: Vector store name; required to load document types.
        session: Session holding preloaded ``document_type_embeddings``.

    Returns:
        Per-category results with matched documents, or ``{}`` when required
        embeddings / classifications are unavailable.
    """
    # --- Ensure checklist embeddings are preloaded ---
    if not hasattr(get_checklist_embedding, '_cache') or not get_checklist_embedding._cache:
        logger.info("Checklist embeddings cache is empty, preloading...")
        try:
            # Self-import: preloader lives elsewhere in this module's package.
            from app.core.search import preload_checklist_embeddings
            count = preload_checklist_embeddings()
            logger.info(f"✅ Preloaded {count} checklist embeddings for processing")
        except Exception as e:
            logger.error(f"Failed to preload checklist embeddings: {e}")
            return {}

    # --- Ensure document type embeddings are available on the session ---
    if session:
        logger.debug(f"Checklist processing session ID: {id(session)}, has embeddings: {hasattr(session, 'document_type_embeddings')}")
        if hasattr(session, 'document_type_embeddings'):
            logger.debug(f"Embeddings count: {len(session.document_type_embeddings) if session.document_type_embeddings else 0}")

    embeddings_missing = not session or not hasattr(session, 'document_type_embeddings') or not session.document_type_embeddings

    # Try to auto-preload embeddings if missing
    if embeddings_missing and store_name:
        logger.info(f"Document type embeddings missing, attempting auto-preload for {store_name}...")
        try:
            from app.core.search import preload_document_type_embeddings
            type_embeddings = preload_document_type_embeddings(store_name)
            if not hasattr(session, 'document_type_embeddings') or session.document_type_embeddings is None:
                session.document_type_embeddings = {}
            session.document_type_embeddings.update(type_embeddings)
            logger.info(f"✅ Auto-preloaded {len(type_embeddings)} document type embeddings")
            embeddings_missing = False
        except Exception as e:
            logger.warning(f"Failed to auto-preload document type embeddings: {e}")

    if embeddings_missing:
        logger.error("Document type embeddings not available. Checklist processing requires preloaded embeddings.")
        logger.error("Make sure data room processing completed successfully or embeddings can be auto-loaded.")
        return {}

    # --- Load document type classifications: the primary comparison targets ---
    doc_types = {}
    if store_name:
        doc_types = _load_document_types(vector_store, store_name)

    if not doc_types:
        logger.warning(f"No document type classifications found for {store_name}")
        return {}

    # --- Score every checklist item against every classified document ---
    results = {}
    for cat_letter, category in checklist.items():
        cat_results = {
            'name': category['name'],
            'items': [],
            'total_items': len(category['items']),
            'matched_items': 0
        }

        for item in category['items']:
            checklist_item_text = item['text'].lower().strip()
            matches = []

            for doc_path, doc_type in doc_types.items():
                if not doc_type or doc_type == 'not classified':
                    continue

                doc_type_lower = doc_type.lower().strip()

                try:
                    # Both embeddings come from preloaded caches; lookups raise
                    # RuntimeError on a miss, which we log and skip below.
                    checklist_embedding = get_checklist_embedding(checklist_item_text)
                    doc_type_embedding = get_document_type_embedding(doc_type_lower, session)

                    # Cosine similarity (np is imported at module level).
                    similarity = np.dot(checklist_embedding, doc_type_embedding) / (
                        np.linalg.norm(checklist_embedding) * np.linalg.norm(doc_type_embedding)
                    )

                    # Only include matches above threshold
                    if similarity >= threshold:
                        doc_name = _extract_doc_name_from_path(doc_path)
                        matches.append({
                            'name': doc_name,
                            'path': doc_path,
                            'full_path': doc_path,  # For consistency
                            'score': round(float(similarity), 3),
                            'document_type': doc_type,
                            'text': f"Document type: {doc_type}"  # Include document type as text
                        })

                except Exception as e:
                    logger.warning(f"Error calculating similarity for {doc_path}: {e}")
                    continue

            # Best matches first; cap at 10 per item for performance.
            matches.sort(key=lambda x: x['score'], reverse=True)
            matches = matches[:10]

            if matches:
                cat_results['matched_items'] += 1
                logger.info(f"✅ Found {len(matches)} matches for checklist item: '{checklist_item_text[:50]}...'")

            cat_results['items'].append({
                'text': item['text'],
                'original': item['original'],
                'matches': matches
            })

        results[cat_letter] = cat_results

    return results
181
+
182
+
183
def _load_document_types(vector_store, store_name: str):
    """Load document type classifications (path -> type) for *store_name*.

    Reads ``<store_name>_document_types.json`` from the FAISS directory.

    Fix: the JSON file is now opened with an explicit UTF-8 encoding; the
    original relied on the platform default, which mis-decodes non-ASCII
    document types on Windows.

    Returns:
        dict mapping document path to its classified type; ``{}`` when the
        file is missing or any error occurs (errors are logged as warnings).
    """
    try:
        from pathlib import Path
        # NOTE(review): sibling code in this module uses get_config(); confirm
        # get_app_config exists and is the intended accessor here.
        from app.core.config import get_app_config
        config = get_app_config()
        doc_types_path = config.paths['faiss_dir'] / f"{store_name}_document_types.json"
        if doc_types_path.exists():
            import json
            with open(doc_types_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except Exception as e:
        logger.warning(f"Failed to load document types for {store_name}: {e}")
    return {}
197
+
198
+
199
+ def _extract_doc_name_from_path(doc_path: str) -> str:
200
+ """Extract document name from file path"""
201
+ try:
202
+ path_obj = Path(doc_path)
203
+ return path_obj.name
204
+ except Exception:
205
+ # Fallback: extract name from path string
206
+ return doc_path.split('/')[-1] if '/' in doc_path else doc_path.split('\\')[-1] if '\\' in doc_path else doc_path
207
+
208
+
209
def get_checklist_embedding(checklist_text: str):
    """Fetch the preloaded in-memory embedding for a checklist item.

    Lookups go only to the function-attribute cache populated during data
    room processing — there is deliberately no fallback computation. On a
    miss, diagnostic details (cache key, size, similar keys) are logged
    before raising.

    Args:
        checklist_text: The checklist item text to look up.

    Returns:
        The embedding vector for the checklist text.

    Raises:
        RuntimeError: If the embedding is not found in the cache.
    """
    cache = getattr(get_checklist_embedding, '_cache', None)
    if cache is None:
        cache = get_checklist_embedding._cache = {}
        logger.warning("Checklist embedding cache was not initialized - this should not happen!")

    # Normalized cache key: lowercase, stripped, Unicode folded to ASCII.
    cache_key = unidecode.unidecode(checklist_text.lower().strip())

    if cache_key in cache:
        return cache[cache_key]

    # --- Miss: emit debugging breadcrumbs before failing hard ---
    cache_size = len(cache)
    logger.warning(f"Checklist embedding not found: '{checklist_text[:50]}...'")
    logger.warning(f"Cache key generated: '{cache_key}'")
    logger.warning(f"Cache has {cache_size} items total")

    if cache_size > 0:
        # Surface near-matches (shared words longer than 3 chars) to help debug.
        search_terms = checklist_text.lower().split()
        similar_keys = [
            key for key in cache
            if any(term in key for term in search_terms if len(term) > 3)
        ]

        if similar_keys:
            logger.warning(f"Similar keys found: {similar_keys[:3]}")
        else:
            logger.warning("No similar keys found in cache")

        sample_keys = list(cache.keys())[:5]
        logger.warning(f"Sample cache keys: {sample_keys}")
    else:
        logger.error("Cache is completely empty - embeddings were not preloaded!")

    raise RuntimeError(
        f"Checklist embedding not found for: '{checklist_text[:50]}...' (cache key: '{cache_key}'). "
        f"Cache has {cache_size} items. "
        "Make sure embeddings were preloaded during data room processing."
    )
270
+
271
+
272
def get_document_type_embedding(doc_type: str, session=None):
    """Look up a document-type embedding preloaded onto the session.

    Args:
        doc_type: The document type text to resolve.
        session: Session object carrying ``document_type_embeddings``.

    Returns:
        The embedding vector for *doc_type*.

    Raises:
        RuntimeError: When no session cache exists or the key is missing.
    """
    preloaded = getattr(session, 'document_type_embeddings', None) if session else None
    if not preloaded:
        raise RuntimeError(f"Document type embedding not found for: '{doc_type[:50]}...'. Preloaded embeddings required.")

    # Normalized key: lowercase, stripped, Unicode folded to ASCII.
    cache_key = unidecode.unidecode(doc_type.lower().strip())

    if cache_key in preloaded:
        return preloaded[cache_key]

    raise RuntimeError(f"Document type embedding not found for: '{doc_type[:50]}...' (cache key: '{cache_key}')")
297
+
298
+
299
def generate_checklist_embeddings():
    """Generate embeddings for all checklist items and save them to disk.

    Intended for the build step: reads every ``*.md`` checklist under the
    configured checklist directory, embeds each parsed item, and writes the
    result to ``<faiss_dir>/checklist_embeddings.json`` keyed by the
    normalized (lowercased, stripped, Unicode-folded) item text — the same
    key scheme :func:`get_checklist_embedding` looks up at runtime.

    Returns:
        int: Number of embeddings generated and saved (0 when no checklist
        files are found).

    Raises:
        RuntimeError: If the overall process fails (per-file and per-item
        failures are logged and skipped, not raised).
    """
    try:
        # Local imports keep module import light; these pull in heavy deps.
        from app.core.config import get_config
        from app.core.model_cache import get_cached_embeddings
        import json
        import numpy as np

        config = get_config()
        embeddings_model = get_cached_embeddings()
        checklist_dir = config.paths['checklist_dir']

        logger.info("🔄 Generating checklist embeddings...")

        # Initialize embeddings cache
        embeddings_cache = {}

        # Process all checklist files
        checklist_files = list(checklist_dir.glob("*.md"))
        if not checklist_files:
            logger.warning(f"No checklist files found in {checklist_dir}")
            return 0

        for checklist_file in checklist_files:
            logger.info(f"Processing checklist: {checklist_file.name}")

            try:
                # Read checklist content
                content = checklist_file.read_text(encoding='utf-8')

                # Parse checklist items from markdown
                checklist_items = _parse_checklist_items_from_markdown(content)

                # Generate embeddings for each item
                for item_text in checklist_items:
                    # Normalize Unicode in cache key (must match runtime lookups)
                    cache_key = item_text.lower().strip()
                    cache_key = unidecode.unidecode(cache_key)

                    # Skip if already processed (dedupes items shared across files)
                    if cache_key in embeddings_cache:
                        continue

                    try:
                        # Generate embedding
                        embedding = embeddings_model.embed_query(item_text)

                        # Handle both list and numpy array cases so the cache
                        # is always JSON-serializable
                        if hasattr(embedding, 'tolist'):
                            embeddings_cache[cache_key] = embedding.tolist()
                        else:
                            # Already a list
                            embeddings_cache[cache_key] = embedding

                        logger.debug(f"✅ Embedded: {item_text[:50]}...")

                    except Exception as e:
                        # Per-item failure: log and keep going with the rest
                        logger.warning(f"Failed to embed checklist item '{item_text[:50]}...': {e}")
                        continue

            except Exception as e:
                # Per-file failure: log and keep going with the other files
                logger.error(f"Failed to process checklist file {checklist_file}: {e}")
                continue

        # Save to disk
        cache_file = config.paths['faiss_dir'] / "checklist_embeddings.json"
        cache_file.parent.mkdir(parents=True, exist_ok=True)

        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(embeddings_cache, f, indent=2, ensure_ascii=False)

        logger.info(f"💾 Saved {len(embeddings_cache)} checklist embeddings to {cache_file}")
        return len(embeddings_cache)

    except Exception as e:
        error_msg = f"Failed to generate checklist embeddings: {e}"
        logger.error(error_msg)
        raise RuntimeError(error_msg)
385
+
386
+
387
def _parse_checklist_items_from_markdown(content: str) -> list:
    """
    Parse checklist items from markdown content.

    Recognizes numbered items ("1. Item text") and bullet items ("- Item text"),
    skipping blank lines, markdown headers and '⸻' separator rules.

    Args:
        content: Markdown content containing checklist items

    Returns:
        list: List of checklist item texts
    """
    import re

    # Matches a leading "1. " style numbering; compiled once instead of the
    # previous match-then-sub double pass per line.
    numbered = re.compile(r'^\d+\.\s+')

    items = []
    for line in content.split('\n'):
        line = line.strip()

        # Skip empty lines, markdown headers and separator rules
        if not line or line.startswith('#') or line.startswith('⸻'):
            continue

        if line.startswith('- '):
            item_text = line[2:].strip()
        else:
            match = numbered.match(line)
            if not match:
                continue
            # Drop the "N. " prefix using the match we already have
            item_text = line[match.end():].strip()

        # Heuristic filter: ignore very short entries and ALL-CAPS headings
        if len(item_text) > 10 and not item_text.isupper():
            items.append(item_text)

    logger.info(f"Parsed {len(items)} checklist items from markdown")
    return items
427
+
428
+
429
def preload_checklist_embeddings():
    """
    Preload all checklist embeddings into memory during data room processing.

    This function loads pre-calculated embeddings from disk into the in-memory cache.
    It should be called once during data room processing to prepare for fast searches.
    When the cache file is missing, it falls back to generating the embeddings
    on the fly via generate_checklist_embeddings() (slow path).

    Returns:
        int: Number of embeddings successfully preloaded

    Raises:
        RuntimeError: If embeddings file doesn't exist and can't be generated,
            or if the cache file can't be loaded
    """
    try:
        # Local imports avoid paying for these at module import time.
        from app.core.config import get_config
        import json
        import numpy as np

        config = get_config()
        cache_file = config.paths['faiss_dir'] / "checklist_embeddings.json"

        if not cache_file.exists():
            logger.warning(f"Checklist embeddings file not found: {cache_file}")
            logger.info("Generating checklist embeddings now...")

            # Fallback: generate the embeddings on-the-fly; the generator
            # writes the same cache file we load below.
            try:
                generated_count = generate_checklist_embeddings()
                if generated_count > 0:
                    logger.info(f"✅ Generated {generated_count} embeddings, now preloading...")
                else:
                    raise RuntimeError("No checklist items found to embed")
            except Exception as gen_error:
                raise RuntimeError(
                    f"Could not generate checklist embeddings: {gen_error}. "
                    "Make sure checklist files exist and are properly formatted."
                )

        # The in-memory cache lives as an attribute on the lookup function so
        # it survives across calls without a module-level global.
        if not hasattr(get_checklist_embedding, '_cache'):
            get_checklist_embedding._cache = {}

        # Load all embeddings from disk
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)

        # Convert JSON lists back to float32 arrays and cache them in memory
        preloaded_count = 0
        for cache_key, embedding_list in cache_data.items():
            # Re-apply Unicode normalization so keys match search-time lookups
            # (keys written by the generator should already be normalized, so
            # this is defensive for older cache files).
            normalized_key = unidecode.unidecode(cache_key)
            embedding_array = np.array(embedding_list, dtype=np.float32)
            get_checklist_embedding._cache[normalized_key] = embedding_array
            preloaded_count += 1

        logger.info(f"✅ Preloaded {preloaded_count} checklist embeddings into memory")
        return preloaded_count

    except Exception as e:
        error_msg = f"Failed to preload checklist embeddings: {e}"
        logger.error(error_msg)
        raise RuntimeError(error_msg)
491
+
492
+
493
def preload_document_type_embeddings(store_name: str):
    """
    Preload all document type embeddings into memory during data room processing.

    This function loads document type classifications and computes their embeddings
    once during data room processing to avoid runtime computation.

    Args:
        store_name: Name of the document store whose classifications to load.

    Returns:
        dict: Dictionary mapping normalized document types to their embeddings
            (numpy float32 arrays)

    Raises:
        RuntimeError: If document types can't be loaded or embeddings can't be computed
    """
    try:
        from app.core.model_cache import get_cached_embeddings
        import numpy as np

        # Load document type classifications (doc path -> type label)
        doc_types = _load_document_types(None, store_name)
        if not doc_types:
            raise RuntimeError(f"No document type classifications found for {store_name}")

        embeddings = get_cached_embeddings()

        # Collect all unique, normalized document types; the document paths
        # themselves are irrelevant here, so iterate values only.
        unique_types = {
            unidecode.unidecode(doc_type.lower().strip())
            for doc_type in doc_types.values()
            if doc_type and doc_type != 'not classified'
        }

        # Precompute one embedding per unique type. np.array handles both the
        # list and the ndarray return shapes of the embeddings backend, so no
        # hasattr branching is needed.
        type_embeddings = {}
        for doc_type in unique_types:
            try:
                embedding = embeddings.embed_query(doc_type)
                type_embeddings[doc_type] = np.array(embedding, dtype=np.float32)
            except Exception as e:
                # Best-effort: skip types whose embedding fails.
                logger.warning(f"Failed to compute embedding for document type '{doc_type}': {e}")
                continue

        logger.info(f"✅ Precomputed {len(type_embeddings)} document type embeddings")
        return type_embeddings

    except Exception as e:
        error_msg = f"Failed to preload document type embeddings: {e}"
        logger.error(error_msg)
        raise RuntimeError(error_msg) from e
549
+
550
+
551
+
552
+
553
def _process_questions(queries: List[Dict], vector_store: FAISS, threshold: float, qa_chain=None, llm=None) -> Dict:
    """Dispatch question processing to RAG batch mode or plain search.

    With both a QA chain and an LLM the questions go through the batched RAG
    pipeline; a QA chain without an LLM is a configuration error; with no QA
    chain a simple similarity search is used.
    """
    if not queries:
        return {'questions': []}

    if qa_chain:
        if not llm:
            raise ValueError("LLM required for RAG processing but not provided")
        return _process_questions_with_rag_batch(queries, vector_store, threshold, llm)

    return _process_questions_simple_search(queries, vector_store, threshold)
564
+
565
+
566
def _process_questions_with_rag_batch(queries: List[Dict], vector_store: FAISS, threshold: float, llm) -> Dict:
    """Process questions via batched LLM calls over retrieved context.

    Fails fast: any failed batch item raises instead of degrading silently.

    Args:
        queries: Question dicts with at least a 'question' key ('category' optional).
        vector_store: FAISS store used for per-question retrieval.
        threshold: Minimum 0-1 similarity a retrieved chunk must reach.
        llm: Chat model used for the batched generation calls.

    Returns:
        Dict with a 'questions' list of answer records.

    Raises:
        RuntimeError: If any batch item fails or returns no response.
    """
    from app.ai.agent_utils import create_batch_processor
    from langchain_core.messages import HumanMessage

    def _similarity(distance: float) -> float:
        # FAISS returns distances (lower is better); map to a 0-1 similarity
        # where distances above 2.0 count as irrelevant. Factored out here
        # because the same conversion is needed in three places below.
        return 1.0 - (distance / 2.0) if distance <= 2.0 else 0.0

    batch_processor = create_batch_processor(llm, max_concurrency=5)
    logger.info(f"Processing {len(queries)} questions using batch processing")

    # Prepare all batch inputs
    batch_inputs = []
    question_contexts = []

    for query in queries:
        question = query['question']

        # Retrieve candidate chunks for this question
        docs_with_scores = vector_store.similarity_search_with_score(question, k=5)
        relevant_docs = [doc for doc, score in docs_with_scores if _similarity(score) >= threshold]

        # Create context and sources
        if relevant_docs:
            context = "\n".join([f"- {doc.metadata.get('name', 'Unknown')}: {doc.page_content[:200]}..."
                                 for doc in relevant_docs[:5]])
            sources = [{'name': doc.metadata.get('name', ''),
                        'path': doc.metadata.get('path', ''),
                        'score': round(_similarity(score), 3)}
                       for doc, score in docs_with_scores[:5] if _similarity(score) >= threshold]
        else:
            context = ""
            sources = []

        question_contexts.append(sources)

        # Create prompt
        prompt_content = f"""Use the provided context to answer the question. Be concise and factual.

Context: {context}

Question: {question}

Answer:"""

        messages = [HumanMessage(content=prompt_content)]
        batch_inputs.append((messages, query))

    # Process batch - fail if anything goes wrong
    batch_results = batch_processor.invoke(batch_inputs)

    # Build results
    results = []
    for idx, result in enumerate(batch_results):
        if not result['success'] or not result['response']:
            raise RuntimeError(f"Failed to process question: {result['item_info']['question']}")

        query = result['item_info']
        answer = result['response'].content.strip()

        results.append({
            'question': query['question'],
            'category': query.get('category', ''),
            'answer': answer,
            'sources': question_contexts[idx],
            'method': 'rag_batch',
            'has_answer': bool(answer and answer.strip())
        })

    return {'questions': results}
636
+
637
+
638
+
639
+
640
def _process_questions_simple_search(queries: List[Dict], vector_store: FAISS, threshold: float) -> Dict:
    """Process questions using simple search without RAG (already fast, no batch needed).

    Each question becomes a similarity search; the "answer" is a synthesized
    sentence citing matching document names rather than LLM-generated text.

    NOTE(review): `score` here is the raw FAISS score compared with
    `score >= threshold` (higher = keep), while the RAG path converts FAISS
    distances to 0-1 similarities before filtering — confirm these two
    filters are intentionally on different scales.
    """
    results = []

    for query in queries:
        question = query['question']
        category = query.get('category', '')

        # Simple search without RAG
        docs_with_scores = vector_store.similarity_search_with_score(question, k=5)
        sources = []
        for doc, score in docs_with_scores:
            if score >= threshold:
                sources.append({
                    'name': doc.metadata.get('name', ''),
                    'path': doc.metadata.get('path', ''),
                    'score': round(score, 3)
                })

        # Synthesized "answer" that only cites document names; no LLM involved
        answer = f"Based on the following documents: {', '.join([s['name'] for s in sources])}" if sources else "No relevant documents found"
        results.append({
            'question': question,
            'category': category,
            'answer': answer,
            'sources': sources,
            'method': 'search',
            'has_answer': bool(sources)
        })

    return {'questions': results}
670
+
671
+
672
def search_documents(query: str, document_processor: DocumentProcessor, top_k: int = 5, threshold: float = None):
    """Run a search through the supplied document processor.

    Args:
        query: Free-text search query.
        document_processor: Processor that owns the index; falsy means "no index".
        top_k: Maximum number of results to return.
        threshold: Optional minimum similarity; processor default when None.

    Returns:
        The processor's search results, or an empty list without a processor.
    """
    return document_processor.search(query, top_k=top_k, threshold=threshold) if document_processor else []
678
+
679
+
680
def hybrid_search(query: str, vector_store: FAISS, store_name: str,
                  top_k: int = 10, sparse_weight: float = 0.3,
                  dense_weight: float = 0.7, threshold: float = SIMILARITY_THRESHOLD) -> List[Dict]:
    """
    Hybrid search combining sparse (BM25) and dense retrieval.

    Results from both retrievers are merged per doc_id and ranked by the
    weighted sum of their scores.

    NOTE(review): the two score scales are mixed without normalization —
    BM25 scores are unbounded while FAISS returns its own score scale — so
    the sparse/dense weights are not directly comparable. Also, the dense
    filter uses `score >= threshold` on the raw FAISS value, whereas other
    code in this module converts FAISS distances to 0-1 similarities first;
    confirm the intended semantics before tuning weights.

    Args:
        query: Search query
        vector_store: FAISS vector store for dense retrieval
        store_name: Name of the document store
        top_k: Number of top results to return
        sparse_weight: Weight for sparse scores (0-1)
        dense_weight: Weight for dense scores (0-1)
        threshold: Minimum similarity threshold for dense retrieval

    Returns:
        Combined search results sorted by hybrid score
    """
    logger.info(f"Performing hybrid search for query: {query[:50]}...")

    # Get sparse results (over-fetch 2x so fusion has candidates to merge)
    sparse_results = []
    bm25_index = load_sparse_index_for_store(store_name)

    if bm25_index:
        sparse_results = bm25_index.search(query, top_k=top_k*2)
        logger.info(f"Sparse search returned {len(sparse_results)} results")
    else:
        logger.warning(f"No sparse index found for {store_name}, falling back to dense only")

    # Get dense results (also over-fetched 2x)
    dense_docs = vector_store.similarity_search_with_score(query, k=top_k*2)
    dense_results = []

    for doc, score in dense_docs:
        if score >= threshold:
            dense_results.append({
                'doc_id': doc.metadata.get('source', ''),
                'document': doc.page_content,
                'score': float(score),
                'metadata': doc.metadata
            })

    logger.info(f"Dense search returned {len(dense_results)} results")

    # Combine results by doc_id using weighted scoring
    combined_scores = {}

    # Process sparse results first so dense hits can merge into them
    for result in sparse_results:
        doc_id = result['doc_id']
        combined_scores[doc_id] = {
            'sparse_score': result['score'] * sparse_weight,
            'dense_score': 0.0,
            'result': result
        }

    # Process dense results; docs found by both retrievers get both scores
    for result in dense_results:
        doc_id = result['doc_id']
        if doc_id in combined_scores:
            combined_scores[doc_id]['dense_score'] = result['score'] * dense_weight
        else:
            combined_scores[doc_id] = {
                'sparse_score': 0.0,
                'dense_score': result['score'] * dense_weight,
                'result': result
            }

    # Calculate final hybrid scores and re-expose the unweighted components
    final_results = []
    for doc_id, scores in combined_scores.items():
        hybrid_score = scores['sparse_score'] + scores['dense_score']

        # Create unified result format (copy so the cached result is untouched)
        result = scores['result'].copy()
        result.update({
            'hybrid_score': hybrid_score,
            'sparse_score': scores['sparse_score'] / sparse_weight if sparse_weight > 0 else 0,
            'dense_score': scores['dense_score'] / dense_weight if dense_weight > 0 else 0,
            'score': hybrid_score  # For backward compatibility
        })
        final_results.append(result)

    # Sort by hybrid score, best first
    final_results.sort(key=lambda x: x['hybrid_score'], reverse=True)

    # Return top_k results
    top_results = final_results[:top_k]
    logger.info(f"Hybrid search returned {len(top_results)} final results")

    return top_results
772
+
773
+
app/core/sparse_index.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BM25 Sparse Index Implementation for Due Diligence Documents
4
+
5
+ This module provides BM25-based sparse retrieval that complements the existing
6
+ dense retrieval system. The index is pre-calculated locally and persisted
7
+ to disk for fast loading on Streamlit Cloud.
8
+ """
9
+
10
+ import pickle
11
+ import os
12
+ import re
13
+ from typing import List, Dict, Optional, Callable, Tuple
14
+ from pathlib import Path
15
+
16
+ from rank_bm25 import BM25Okapi
17
+ from app.core.logging import logger
18
+
19
+
20
class BM25Index:
    """
    BM25-based sparse index for document retrieval.

    This class provides:
    - Pre-calculated BM25 index persistence (pickle on disk)
    - Custom tokenization for legal/financial documents
    - Efficient search with relevance scoring
    - Integration with existing document processing pipeline
    """

    def __init__(self, index_path: str):
        """
        Initialize BM25 index.

        Args:
            index_path: Path to save/load the index file
        """
        self.index_path = Path(index_path)
        # Underlying rank_bm25 index; None until build_index() or load_index()
        self.bm25: Optional[BM25Okapi] = None
        # documents, doc_ids and tokenized_docs are parallel lists
        self.documents: List[str] = []
        self.doc_ids: List[str] = []
        self.tokenized_docs: List[List[str]] = []
        # Build statistics written by build_index()
        self.metadata: Dict = {}

    def custom_tokenizer(self, text: str) -> List[str]:
        """
        Custom tokenization optimized for legal and financial documents.

        Lowercases, extracts word tokens, strips underscores and drops
        single-character tokens.

        NOTE(review): the abbreviation-preservation loop below is a no-op —
        none of the entries contain a space, so `abbrev.replace(' ', '_')`
        returns the abbreviation unchanged. The later
        `token.replace('_', '')` also strips underscores from ANY token
        (e.g. "foo_bar" becomes "foobar"), not just restored abbreviations.
        Confirm intent before changing: altering tokenization invalidates
        all persisted indexes.
        """
        if not text:
            return []

        # Convert to lowercase
        text = text.lower()

        # Preserve important legal/financial abbreviations
        legal_abbrevs = [
            'llc', 'inc', 'corp', 'ltd', 'co', 'lp', 'llp',
            'ipo', 'gaap', 'sec', 'fdic', 'irs', 'sox', 'gdpr',
            'nda', 'mou', 'spa', 'joa', 'ipa', 'dpa'
        ]

        # Replace common terms to avoid splitting (currently a no-op — see NOTE above)
        for abbrev in legal_abbrevs:
            text = text.replace(abbrev, abbrev.replace(' ', '_'))

        # Split on whitespace and punctuation (\w includes underscores)
        tokens = re.findall(r'\b\w+\b', text)

        # Strip underscores from every token (see NOTE above)
        tokens = [token.replace('_', '') for token in tokens]

        # Filter out very short tokens (likely noise)
        tokens = [token for token in tokens if len(token) > 1]

        return tokens

    def build_index(self, documents: List[Dict[str, str]], custom_tokenizer: Optional[Callable] = None):
        """
        Build BM25 index from documents and persist it to disk.

        Args:
            documents: List of dicts with 'id' and 'content' keys
            custom_tokenizer: Optional custom tokenization function
        """
        logger.info(f"Building BM25 index from {len(documents)} documents")

        # Extract content and IDs (kept as parallel lists)
        self.documents = [doc['content'] for doc in documents]
        self.doc_ids = [doc['id'] for doc in documents]

        # Tokenize documents with the supplied or default tokenizer
        tokenizer = custom_tokenizer or self.custom_tokenizer
        self.tokenized_docs = [tokenizer(doc) for doc in self.documents]

        # Build BM25 index
        self.bm25 = BM25Okapi(self.tokenized_docs)

        # Store metadata (build statistics, surfaced via get_stats())
        self.metadata = {
            'total_documents': len(self.documents),
            'total_tokens': sum(len(tokens) for tokens in self.tokenized_docs),
            'avg_tokens_per_doc': sum(len(tokens) for tokens in self.tokenized_docs) / len(self.documents) if self.documents else 0
        }

        # Save to disk
        self._save_index()

        logger.info(f"✅ BM25 index built and saved: {self.metadata}")

    def _save_index(self):
        """Save index to pickle file (creates parent directories as needed)."""
        self.index_path.parent.mkdir(parents=True, exist_ok=True)

        index_data = {
            'bm25': self.bm25,
            'documents': self.documents,
            'doc_ids': self.doc_ids,
            'tokenized_docs': self.tokenized_docs,
            'metadata': self.metadata
        }

        with open(self.index_path, 'wb') as f:
            pickle.dump(index_data, f)

        logger.info(f"💾 BM25 index saved to {self.index_path}")

    def load_index(self) -> bool:
        """
        Load index from disk.

        NOTE(review): pickle.load can execute arbitrary code from the file —
        only load index files produced by this application's own build.

        Returns:
            True if index loaded successfully, False otherwise
        """
        if self.index_path.exists():
            try:
                with open(self.index_path, 'rb') as f:
                    index_data = pickle.load(f)

                self.bm25 = index_data['bm25']
                self.documents = index_data['documents']
                self.doc_ids = index_data['doc_ids']
                self.tokenized_docs = index_data['tokenized_docs']
                # metadata is optional for backward compatibility with older files
                self.metadata = index_data.get('metadata', {})

                logger.info(f"📂 BM25 index loaded: {len(self.documents)} documents")
                return True

            except Exception as e:
                logger.error(f"Failed to load BM25 index: {e}")
                return False
        else:
            logger.warning(f"BM25 index not found: {self.index_path}")
            return False

    def search(self, query: str, top_k: int = 10, custom_tokenizer: Optional[Callable] = None) -> List[Dict]:
        """
        Search the BM25 index.

        Args:
            query: Search query
            top_k: Number of top results to return
            custom_tokenizer: Optional custom tokenization function (should match
                the tokenizer used at build time for sensible scores)

        Returns:
            List of search results with scores (only strictly-positive scores)
        """
        if not self.bm25:
            logger.warning("BM25 index not loaded")
            return []

        # Tokenize query with the same scheme as the indexed documents
        tokenizer = custom_tokenizer or self.custom_tokenizer
        tokenized_query = tokenizer(query)

        if not tokenized_query:
            logger.warning("Query produced no tokens")
            return []

        # Get BM25 scores (one per indexed document)
        scores = self.bm25.get_scores(tokenized_query)

        # Get top results
        if len(scores) == 0:
            return []

        # Get indices of top scores (handling edge case of fewer results than requested)
        num_results = min(top_k, len(scores))
        top_indices = scores.argsort()[-num_results:][::-1]

        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only return relevant results
                results.append({
                    'doc_id': self.doc_ids[idx],
                    'document': self.documents[idx],
                    'score': float(scores[idx]),
                    'rank': len(results) + 1
                })

        logger.debug(f"BM25 search returned {len(results)} results for query: {query[:50]}...")
        return results

    def get_stats(self) -> Dict:
        """Get index statistics (file size, load status and build metadata)."""
        if not self.index_path.exists():
            return {'status': 'index_not_found'}

        stats = {
            'index_path': str(self.index_path),
            'index_exists': self.index_path.exists(),
            'is_loaded': self.bm25 is not None,
            'index_size_mb': self.index_path.stat().st_size / (1024 * 1024) if self.index_path.exists() else 0
        }

        if self.metadata:
            stats.update(self.metadata)

        return stats
225
+
226
+
227
def build_sparse_index_for_store(store_name: str, documents: List[Dict[str, str]],
                                 index_dir: str = "data/search_indexes") -> BM25Index:
    """Build and persist a BM25 index for one document store.

    Args:
        store_name: Store identifier used in the index filename
            (e.g. 'summit-digital-solutions-inc').
        documents: Documents with 'id' and 'content' keys.
        index_dir: Directory where the pickled index is written.

    Returns:
        The freshly built BM25Index instance.
    """
    index = BM25Index(f"{index_dir}/{store_name}_bm25.pkl")
    index.build_index(documents)
    return index
244
+
245
+
246
def load_sparse_index_for_store(store_name: str, index_dir: str = "data/search_indexes") -> Optional[BM25Index]:
    """Load the persisted BM25 index for a document store, if present.

    Args:
        store_name: Store identifier used in the index filename.
        index_dir: Directory containing the pickled index.

    Returns:
        The loaded BM25Index, or None when no usable index exists.
    """
    index = BM25Index(f"{index_dir}/{store_name}_bm25.pkl")
    return index if index.load_index() else None
app/core/stage_manager.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Stage-based Build System for FAISS Index Generation
4
+
5
+ This module provides a stage-based build system that allows for incremental
6
+ builds, dependency management, and smart skipping of completed stages.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Dict, List, Any, Optional, Set
14
+ from datetime import datetime
15
+ import glob
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
# Stage definitions with dependencies and outputs.
# Each entry declares a human-readable name/description, the stages that must
# complete first ('dependencies'), the artifact glob patterns the stage is
# expected to produce ('outputs' — checked by StageTracker.is_stage_complete
# to decide whether a stage can be skipped), and a rough duration estimate
# used only for operator feedback.
STAGES = {
    'scan': {
        'name': 'Document Scanning',
        'description': 'Scan and catalog all documents',
        'dependencies': [],
        'outputs': ['.scan_cache.json'],
        'estimated_duration': '30s'
    },
    'extract': {
        'name': 'Text Extraction',
        'description': 'Extract text from PDFs and documents',
        'dependencies': ['scan'],
        'outputs': ['.extraction_cache.json'],
        'estimated_duration': '5-10m'
    },
    'classify': {
        'name': 'Document Classification',
        'description': 'Classify document types using AI',
        'dependencies': ['extract'],
        'outputs': ['*_document_types.json'],
        'estimated_duration': '3-5m'
    },
    'chunk': {
        'name': 'Text Chunking',
        'description': 'Split documents into semantic chunks',
        'dependencies': ['extract'],
        'outputs': ['.chunking_cache.json'],
        'estimated_duration': '2-3m'
    },
    'embed': {
        'name': 'Vector Embeddings',
        'description': 'Generate embeddings for all chunks',
        'dependencies': ['chunk'],
        'outputs': ['*.pkl'],
        'estimated_duration': '5-8m'
    },
    'index': {
        'name': 'FAISS Indexing',
        'description': 'Build and save FAISS vector indices',
        'dependencies': ['embed'],
        'outputs': ['*.faiss'],
        'estimated_duration': '1-2m'
    },
    'sparse': {
        'name': 'BM25 Sparse Indexing',
        'description': 'Build BM25 sparse indices for hybrid search',
        'dependencies': ['extract'],
        'outputs': ['*_bm25.pkl'],
        'estimated_duration': '2-3m'
    }
}
71
+
72
+
73
class StageTracker:
    """Tracks the state and completion status of build stages.

    State is persisted as JSON in `<faiss_dir>/.build_state.json` so that
    interrupted builds can resume and completed stages can be skipped.
    """

    def __init__(self, faiss_dir: Path):
        self.faiss_dir = faiss_dir
        self.state_file = faiss_dir / '.build_state.json'
        self.state = self._load_state()

    def _load_state(self) -> Dict[str, Any]:
        """Load current build state from disk, starting fresh when the file
        is missing, unreadable, or corrupted."""
        if self.state_file.exists():
            try:
                return json.loads(self.state_file.read_text())
            except (json.JSONDecodeError, OSError) as e:
                # An unreadable/corrupted state file must not abort the build —
                # worst case, completed stages are simply re-run.
                logger.warning(f"Corrupted state file, starting fresh: {e}")
                return self._create_initial_state()
        else:
            return self._create_initial_state()

    def _create_initial_state(self) -> Dict[str, Any]:
        """Create initial state structure."""
        return {
            'stages': {},
            'last_build': None,
            'version': '1.0',
            'total_builds': 0
        }

    def _save_state(self):
        """Persist current state to disk (creates parent dirs as needed)."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(json.dumps(self.state, indent=2))

    def is_stage_complete(self, stage_name: str) -> bool:
        """Check if a stage is recorded AND all its declared outputs exist."""
        if stage_name not in self.state['stages']:
            return False

        stage_config = STAGES[stage_name]

        # Verify every declared output pattern matches at least one file;
        # a recorded stage with missing artifacts counts as incomplete.
        for output_pattern in stage_config['outputs']:
            pattern_path = self.faiss_dir / output_pattern
            if not glob.glob(str(pattern_path)):
                logger.debug(f"Missing output: {pattern_path}")
                return False

        return True

    def mark_stage_complete(self, stage_name: str, metadata: Optional[Dict[str, Any]] = None):
        """Mark a stage as completed (with optional metadata) and persist."""
        self.state['stages'][stage_name] = {
            'completed_at': datetime.now().isoformat(),
            'metadata': metadata or {}
        }
        self._save_state()

    def mark_stage_failed(self, stage_name: str, error: str):
        """Record a stage failure (with the error message) and persist."""
        self.state['stages'][stage_name] = {
            'failed_at': datetime.now().isoformat(),
            'error': error,
            'status': 'failed'
        }
        self._save_state()

    def should_skip_stage(self, stage_name: str, force_clean: bool) -> bool:
        """Return True when a stage can be skipped (complete and not forced)."""
        if force_clean:
            return False
        return self.is_stage_complete(stage_name)

    def get_stage_status(self, stage_name: str) -> Dict[str, Any]:
        """Get detailed status of a stage ('not_started' when never recorded)."""
        if stage_name not in self.state['stages']:
            return {'status': 'not_started'}

        stage_info = self.state['stages'][stage_name]
        is_complete = self.is_stage_complete(stage_name)

        return {
            'status': 'completed' if is_complete else 'incomplete',
            'completed_at': stage_info.get('completed_at'),
            'metadata': stage_info.get('metadata', {}),
            'error': stage_info.get('error'),
            'is_complete': is_complete
        }

    def get_build_summary(self) -> Dict[str, Any]:
        """Summarize completion/failure status across all defined stages."""
        completed_stages = []
        incomplete_stages = []
        failed_stages = []

        for stage_name in STAGES.keys():
            status = self.get_stage_status(stage_name)
            if status['status'] == 'completed':
                completed_stages.append(stage_name)
            elif status.get('error'):
                failed_stages.append(stage_name)
            else:
                incomplete_stages.append(stage_name)

        return {
            'completed_stages': completed_stages,
            'incomplete_stages': incomplete_stages,
            'failed_stages': failed_stages,
            'last_build': self.state.get('last_build'),
            'total_builds': self.state.get('total_builds', 0)
        }

    def reset_stage(self, stage_name: str):
        """Reset a specific stage to not started."""
        if stage_name in self.state['stages']:
            del self.state['stages'][stage_name]
            self._save_state()

    def reset_all_stages(self):
        """Reset all stages to not started."""
        self.state['stages'] = {}
        self._save_state()
+
196
+
197
class StageManager:
    """Manages execution of build stages with dependency resolution.

    Subclasses must implement execute_stage(); this base class provides
    dependency ordering, skip logic and bookkeeping via StageTracker.
    """

    def __init__(self, faiss_dir: Path):
        # Directory holding all build artifacts and the persisted build state
        self.faiss_dir = faiss_dir
        self.tracker = StageTracker(faiss_dir)

    def resolve_dependencies(self, target_stages: List[str], completed_stages: Set[str]) -> List[str]:
        """Resolve which stages need to run based on dependencies.

        Walks each target stage's dependency chain recursively (depth-first),
        appending prerequisites before the stage itself, then deduplicates
        while preserving order — yielding a valid execution order. Assumes
        STAGES contains no dependency cycles.

        Args:
            target_stages: Stage names requested by the caller.
            completed_stages: Stages already done that can be skipped.

        Returns:
            Ordered list of stage names to execute.

        Raises:
            ValueError: If a target stage is not defined in STAGES.
        """
        to_run = []

        for stage_name in target_stages:
            if stage_name not in STAGES:
                raise ValueError(f"Unknown stage: {stage_name}")

            # Check dependencies recursively
            for dep in STAGES[stage_name]['dependencies']:
                if dep not in completed_stages:
                    dep_chain = self.resolve_dependencies([dep], completed_stages)
                    to_run.extend(dep_chain)

            if stage_name not in completed_stages:
                to_run.append(stage_name)

        # Remove duplicates while preserving order
        seen = set()
        result = []
        for stage in to_run:
            if stage not in seen:
                seen.add(stage)
                result.append(stage)

        return result

    def get_completed_stages(self, force_clean: bool = False) -> Set[str]:
        """Get set of completed stages (empty when force_clean forces a rebuild)."""
        if force_clean:
            return set()

        completed = set()
        for stage_name in STAGES.keys():
            if self.tracker.is_stage_complete(stage_name):
                completed.add(stage_name)
        return completed

    def execute_stage(self, stage_name: str, **kwargs) -> Dict[str, Any]:
        """Execute a specific stage - to be implemented by subclasses."""
        raise NotImplementedError(f"Stage execution not implemented for: {stage_name}")

    def run_build_pipeline(self, target_stages: Optional[List[str]] = None,
                           force_clean: bool = False) -> Dict[str, Any]:
        """Run the build pipeline with dependency resolution.

        Executes (or skips) each required stage in order, records completion
        or failure in the tracker, and stops at the first failure so stages
        that depend on the failed one never run.

        Args:
            target_stages: Stages to build; all defined stages when None.
            force_clean: When True, ignore previous completion state and
                re-run everything.

        Returns:
            Summary dict with per-stage results and aggregate counts/timing.
        """

        # Default to all stages if none specified
        if target_stages is None:
            target_stages = list(STAGES.keys())

        # Get completed stages
        completed_stages = self.get_completed_stages(force_clean)

        # Resolve which stages need to run
        stages_to_run = self.resolve_dependencies(target_stages, completed_stages)

        logger.info(f"Build pipeline: {len(stages_to_run)} stages to execute")

        results = []
        for stage_name in stages_to_run:
            stage_config = STAGES[stage_name]

            if self.tracker.should_skip_stage(stage_name, force_clean):
                logger.info(f"⏭️ Skipping stage '{stage_name}' (already complete)")
                results.append({
                    'stage': stage_name,
                    'status': 'skipped',
                    'reason': 'already_complete'
                })
                continue

            logger.info(f"🚀 Executing stage '{stage_name}': {stage_config['description']}")
            start_time = time.time()

            try:
                # Execute the stage
                result = self.execute_stage(stage_name, force_clean=force_clean)

                # Mark as complete
                execution_time = time.time() - start_time
                self.tracker.mark_stage_complete(stage_name, {
                    'execution_time': execution_time,
                    'result': result
                })

                logger.info(f"✅ Stage '{stage_name}' completed in {execution_time:.1f}s")
                results.append({
                    'stage': stage_name,
                    'status': 'completed',
                    'execution_time': execution_time,
                    'result': result
                })

            except Exception as e:
                execution_time = time.time() - start_time
                error_msg = f"Stage '{stage_name}' failed after {execution_time:.1f}s: {e}"
                logger.error(f"❌ {error_msg}")

                self.tracker.mark_stage_failed(stage_name, str(e))

                results.append({
                    'stage': stage_name,
                    'status': 'failed',
                    'execution_time': execution_time,
                    'error': str(e)
                })

                # Don't continue with dependent stages on failure
                break

        # Update build metadata. NOTE(review): this reaches into tracker
        # internals (state / _save_state); consider a public tracker method.
        self.tracker.state['last_build'] = datetime.now().isoformat()
        self.tracker.state['total_builds'] = self.tracker.state.get('total_builds', 0) + 1
        self.tracker._save_state()

        return {
            'success': all(r['status'] in ['completed', 'skipped'] for r in results),
            'stages_executed': len([r for r in results if r['status'] == 'completed']),
            'stages_skipped': len([r for r in results if r['status'] == 'skipped']),
            'stages_failed': len([r for r in results if r['status'] == 'failed']),
            'results': results,
            'total_time': sum(r.get('execution_time', 0) for r in results)
        }
+ }
app/core/utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility Functions Module
4
+
5
+ Collection of utility functions used throughout the application.
6
+ This module contains helper functions for file operations, formatting,
7
+ and document processing utilities.
8
+ """
9
+
10
+ from typing import List, Optional
11
+ from pathlib import Path
12
+
13
+
14
def get_mime_type(file_path: Path) -> str:
    """Return the MIME (media) type for *file_path* based on its extension.

    Unrecognised extensions fall back to the generic binary type
    ``application/octet-stream``.

    Args:
        file_path: Path whose suffix (case-insensitive) selects the type.

    Returns:
        An IANA media type string.
    """
    # A lookup table is clearer and easier to extend than an if/elif chain.
    # NOTE: .docx is the OOXML format and has its own registered media type;
    # 'application/msword' applies only to legacy binary .doc files.
    mime_by_extension = {
        '.pdf': 'application/pdf',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.txt': 'text/plain',
        '.md': 'text/markdown',
    }
    return mime_by_extension.get(file_path.suffix.lower(), 'application/octet-stream')
27
+
28
+
29
def format_document_title(doc_name: str) -> str:
    """Turn a raw document file name into a human-readable title.

    Strips the last extension (if any), replaces underscores and hyphens
    with spaces, and applies title casing.
    """
    # rsplit('.', 1)[0] returns the name unchanged when there is no dot,
    # so no separate "has extension" branch is needed.
    stem = doc_name.rsplit('.', 1)[0]
    return stem.replace('_', ' ').replace('-', ' ').title()
36
+
37
+
38
def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
    """Recursively count files under *directory* with a supported extension.

    Args:
        directory: Root directory to scan.
        supported_extensions: Lowercase extensions (including the leading
            dot) to count; defaults to the common document formats.

    Returns:
        Number of matching files found anywhere under *directory*.
    """
    if supported_extensions is None:
        supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']

    total = 0
    for entry in directory.rglob('*'):
        if entry.is_file() and entry.suffix.lower() in supported_extensions:
            total += 1
    return total
45
+
46
+
47
def create_document_processor(store_name: Optional[str] = None) -> 'DocumentProcessor':
    """
    Build and return a ready-to-use DocumentProcessor.

    Centralises the creation pattern so callers do not have to import the
    processor class themselves.

    Args:
        store_name: Optional name for the FAISS store (uses config default if None)

    Returns:
        Initialized DocumentProcessor instance
    """
    # Imported inside the function — presumably to avoid an import cycle
    # at module load time (confirm against app.core.document_processor).
    from app.core.document_processor import DocumentProcessor

    return DocumentProcessor(store_name=store_name)
app/handlers/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Handlers Package
3
+
4
+ Contains business logic handlers that coordinate between UI and services.
5
+ """
6
+
7
+ from .document_handler import DocumentHandler
8
+ from .ai_handler import AIHandler
9
+ from .export_handler import ExportHandler
10
+
11
+ __all__ = ['DocumentHandler', 'AIHandler', 'ExportHandler']
app/handlers/ai_handler.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AI Handler
4
+
5
+ Handles AI operations and coordinates between UI and AI service.
6
+ """
7
+
8
+ from typing import Optional, List
9
+
10
+ from app.ui.session_manager import SessionManager
11
+ from app.services.ai_service import AIService, create_ai_service
12
+ from app.core.exceptions import AIError, ConfigError, create_ai_error
13
+ from app.ui.error_handler import handle_processing_errors
14
+ from app.core.logging import logger
15
+
16
+
17
class AIHandler:
    """
    AI handler that manages AI operations using the AI service.

    Provides a clean interface between UI and AI service, including
    recovery of a service instance that an earlier run stored in the
    session.
    """

    def __init__(self, session: SessionManager):
        """Initialize handler with session manager"""
        self.session = session
        # Lazily populated; may also be adopted from self.session.agent.
        self._ai_service: Optional[AIService] = None

    @staticmethod
    def _adjusted_max_tokens(model_choice: str, max_tokens: int) -> int:
        """Clamp *max_tokens* to the output-token limit of the chosen model.

        Claude Haiku is hard-capped at 8192 output tokens; Sonnet models
        accept more but are deliberately capped at the same conservative
        value for reliability. Other models are left unchanged.
        """
        model = model_choice.lower()
        if 'haiku' in model:
            # Claude Haiku has a maximum of 8192 output tokens
            return min(max_tokens, 8192)
        if 'sonnet' in model:
            # Conservative limit for reliability (same cap as Haiku on purpose)
            return min(max_tokens, 8192)
        return max_tokens

    @handle_processing_errors("AI service setup", "Please check your API key and try again")
    def setup_agent(self, api_key: str, model_choice: str) -> bool:
        """
        Setup AI service with given credentials.

        Args:
            api_key: Anthropic API key
            model_choice: Claude model to use

        Returns:
            True if AI service was successfully initialized

        Raises:
            AIError: If AI service setup fails
            ConfigError: If API key or model is invalid
        """
        # Get appropriate max_tokens for the model
        from app.core.config import get_app_config
        config = get_app_config()

        # Adjust max_tokens based on model limitations
        original_max_tokens = config.model['max_tokens']
        max_tokens = self._adjusted_max_tokens(model_choice, original_max_tokens)

        if max_tokens != original_max_tokens:
            logger.info(f"Adjusted max_tokens for {model_choice}: {original_max_tokens} -> {max_tokens}")

        logger.info(f"Initializing AI service: model={model_choice}, max_tokens={max_tokens}, temperature={config.model['temperature']}")

        # Create AI service with proper token limits
        self._ai_service = create_ai_service(
            api_key=api_key,
            model=model_choice,
            temperature=config.model['temperature'],
            max_tokens=max_tokens
        )

        # Check if service was created successfully
        if self._ai_service is None:
            raise create_ai_error(
                "AI service creation failed",
                recovery_hint="Please check your API key and try again"
            )

        # Test the service; publish it via the session so other components
        # (and future AIHandler instances) can reuse it.
        if self._ai_service.is_available:
            self.session.agent = self._ai_service
            return True

        raise create_ai_error(
            "AI service initialization failed",
            recovery_hint="Please check your API key and network connection"
        )

    def is_agent_available(self) -> bool:
        """
        Check if AI service is available and ready.

        Returns:
            True if AI service is available
        """
        # A locally held, working service wins.
        if self._ai_service is not None and self._ai_service.is_available:
            return True

        # Otherwise fall back to a service a previous run stored in the
        # session, adopting it as our local reference.
        if self.session.agent is not None:
            self._ai_service = self.session.agent
            return self._ai_service.is_available

        return False

    @handle_processing_errors("Report generation", "Please check your documents and try again")
    def generate_report(self, report_type: str, **kwargs) -> Optional[str]:
        """
        Generate a report using the AI service.

        Args:
            report_type: Type of report ('overview', 'strategic', 'checklist', 'questions')
            **kwargs: Additional arguments for report generation
                (documents, strategy_text, checklist_results)

        Returns:
            Generated report content or None if failed

        Raises:
            AIError: If report generation fails or no service is configured
        """
        if not self.is_agent_available():
            raise create_ai_error(
                "AI service not available",
                recovery_hint="Please configure your API key in the sidebar"
            )

        documents = kwargs.get('documents', {})
        strategy_text = kwargs.get('strategy_text')
        checklist_results = kwargs.get('checklist_results')

        return self._ai_service.analyze_documents(
            documents=documents,
            analysis_type=report_type,
            strategy_text=strategy_text,
            checklist_results=checklist_results
        )

    @handle_processing_errors("Question answering", "Please try rephrasing your question")
    def answer_question(self, question: str, context_docs: List[str]) -> str:
        """
        Answer a specific question using AI.

        Args:
            question: The question to answer
            context_docs: List of relevant document excerpts

        Returns:
            AI-generated answer

        Raises:
            AIError: If question answering fails or no service is configured
        """
        if not self.is_agent_available():
            raise create_ai_error(
                "AI service not available",
                recovery_hint="Please configure your API key in the sidebar"
            )

        return self._ai_service.answer_question(question, context_docs)

    @property
    def llm(self):
        """Get the underlying LLM instance.

        Unlike :meth:`is_agent_available`, this does not check availability;
        it returns whatever LLM handle the current service holds, or None.
        """
        if self._ai_service is not None:
            return self._ai_service.llm

        # Adopt a session-stored service if one exists.
        if self.session.agent is not None:
            self._ai_service = self.session.agent
            return self._ai_service.llm

        return None
app/handlers/document_handler.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Document Handler
4
+
5
+ Handles document processing operations and coordinates with the document processor.
6
+ """
7
+
8
from pathlib import Path
from typing import Any, Dict, List, Optional

from app.ui.session_manager import SessionManager
from app.core.exceptions import ProcessingError
from app.ui.error_handler import ErrorHandler, handle_processing_errors
from app.core.exceptions import DocumentProcessingError, FileOperationError, create_processing_error
from app.core.logging import logger
16
+
17
+
18
class DocumentHandler:
    """
    Document handler that manages document processing operations.

    Loads pre-built FAISS indices for a data room, stores documents,
    chunks and embeddings in the session, and validates data-room paths.
    """

    def __init__(self, session: SessionManager):
        """Initialize handler with session manager"""
        self.session = session

    @handle_processing_errors("Data room processing", "Please check that the data room exists and contains documents")
    def process_data_room_fast(self, data_room_path: str):
        """
        Fast data room processing using pre-built FAISS indices.

        Args:
            data_room_path: Path to the data room directory

        Returns:
            Tuple of (documents_count, chunks_count) or None on error

        Raises:
            ProcessingError: If no pre-built index exists or no documents
                are found.
            RuntimeError: If checklist or document-type embeddings cannot
                be preloaded (deliberately re-raised as hard failures).
        """
        # Store-name convention: the lower-cased data-room directory name.
        company_name = Path(data_room_path).name.lower()

        # Initialize document processor with loaded FAISS store
        from app.core.utils import create_document_processor
        document_processor = create_document_processor(store_name=company_name)

        if not document_processor.vector_store:
            raise create_processing_error(
                f"No pre-built FAISS index found for '{company_name}'",
                recovery_hint="Please run scripts/build_indexes.py first to create the index"
            )

        # Quick document metadata scan (file contents are NOT loaded here)
        documents_dict = self._quick_document_scan(data_room_path)

        # Get chunks from FAISS metadata
        chunks = self._extract_chunks_from_faiss(document_processor)

        # Store in session
        self.session.documents = documents_dict
        self.session.chunks = chunks
        self.session.embeddings = document_processor.embeddings
        self.session.vdr_store = company_name

        # Preload checklist embeddings into memory for fast search
        from app.core.search import preload_checklist_embeddings
        logger.info("Attempting to preload checklist embeddings...")
        try:
            preloaded_count = preload_checklist_embeddings()
            logger.info(f"✅ Successfully preloaded {preloaded_count} checklist embeddings for fast searching")
        except RuntimeError as e:
            logger.error(f"❌ Failed to preload checklist embeddings: {e}")
            logger.error("This will cause checklist matching to fail - embeddings must be available for search")
            # Hard failure by design: checklist matching cannot work without them.
            raise

        # Preload document type embeddings into memory for fast search
        from app.core.search import preload_document_type_embeddings
        logger.info("Attempting to preload document type embeddings...")
        try:
            type_embeddings = preload_document_type_embeddings(company_name)
            # Store in session for use during search
            self.session.document_type_embeddings = type_embeddings
            logger.info(f"✅ Successfully preloaded {len(type_embeddings)} document type embeddings for fast searching")
            logger.info(f"Session ID: {id(self.session)}, Embeddings stored: {bool(self.session.document_type_embeddings)}")
        except RuntimeError as e:
            logger.error(f"❌ Failed to preload document type embeddings: {e}")
            logger.error("Checklist processing will fail - embeddings are required")
            raise  # Make this a hard failure since embeddings are now required

        # Clear existing analysis.
        # NOTE(review): reset() runs AFTER documents/chunks are stored above —
        # confirm it only clears derived analysis and not the fields just set.
        self.session.reset()

        logger.info(f"Successfully processed {len(documents_dict)} documents and {len(chunks)} chunks")
        return len(documents_dict), len(chunks)

    def _quick_document_scan(self, data_room_path: str) -> Dict[str, Any]:
        """Quick scan of document files without loading content.

        Builds a metadata-only document map keyed by absolute file path;
        the 'content' field holds a size placeholder, not real text.

        Raises:
            ProcessingError: If the path is missing or contains no
                supported documents.
        """
        documents_dict = {}
        data_room_path_obj = Path(data_room_path)

        # Validate data room path exists
        if not data_room_path_obj.exists():
            raise create_processing_error(
                f"Data room path does not exist: {data_room_path}",
                recovery_hint="Please select a valid data room directory"
            )

        # Quick file system scan for supported extensions
        from app.core import get_config
        config = get_config()
        supported_extensions = config.get_supported_extensions()

        for ext in supported_extensions:
            for file_path in data_room_path_obj.rglob(f"*{ext}"):
                if file_path.is_file():
                    try:
                        rel_path = file_path.relative_to(data_room_path_obj)
                        documents_dict[str(file_path)] = {
                            'name': file_path.name,
                            'path': str(rel_path),
                            # Placeholder only — real text lives in the FAISS index.
                            'content': f"[Indexed - {file_path.stat().st_size:,} bytes]",
                            'metadata': {
                                'source': str(file_path),
                                'name': file_path.name,
                                'path': str(rel_path)
                            }
                        }
                    except ValueError:
                        # relative_to() raises for files outside the data room path
                        continue

        if not documents_dict:
            raise create_processing_error(
                f"No supported documents found in {data_room_path}",
                recovery_hint="Please ensure the data room contains PDF, DOCX, or text files"
            )

        return documents_dict

    def _extract_chunks_from_faiss(self, document_processor) -> List[Dict]:
        """Extract chunk information from loaded FAISS store.

        Each chunk dict carries a text preview (truncated to 500 chars),
        the source document name/path and the full metadata record.
        Falls back to a single placeholder chunk on extraction errors.
        """
        chunks = []

        if not document_processor.vector_store:
            logger.warning("No vector store available for chunk extraction")
            return chunks

        try:
            # Access the docstore to get document metadata.
            # NOTE(review): iterating docstore._dict relies on a private
            # LangChain attribute — may break on library upgrades.
            docstore = document_processor.vector_store.docstore

            for doc_id in docstore._dict.keys():
                doc = docstore._dict[doc_id]
                chunk_text = doc.page_content
                # Keep only a preview; full text is retrievable via search.
                if len(chunk_text) > 500:
                    chunk_text = chunk_text[:500] + "..."

                chunks.append({
                    'text': chunk_text,
                    'source': doc.metadata.get('name', ''),
                    'path': doc.metadata.get('path', ''),
                    'full_path': doc.metadata.get('source', ''),
                    'metadata': doc.metadata
                })

        except (DocumentProcessingError, FileOperationError) as e:
            ErrorHandler.handle_error(
                e,
                "Failed to extract chunks from FAISS store",
                recovery_hint="The FAISS index may be corrupted"
            )
            # Fallback: create a minimal placeholder so downstream code
            # always receives a non-empty chunk list.
            chunks = [{
                'text': '[Content available in search]',
                'source': 'indexed_content',
                'path': '',
                'full_path': '',
                'metadata': {}
            }]

        return chunks

    def get_document_processor(self, store_name: Optional[str] = None):
        """
        Get a configured document processor.

        Args:
            store_name: Optional store name for the processor

        Returns:
            Configured DocumentProcessor instance
        """
        from app.core.utils import create_document_processor
        return create_document_processor(store_name=store_name)

    def validate_data_room(self, data_room_path: str) -> bool:
        """
        Validate that a data room path exists and contains documents.

        Args:
            data_room_path: Path to validate

        Returns:
            True if valid, False otherwise
        """
        path_obj = Path(data_room_path)
        if not path_obj.exists():
            return False

        return self._has_supported_files(path_obj)

    def _has_supported_files(self, path_obj: Path) -> bool:
        """
        Check if path contains files with supported extensions.

        Args:
            path_obj: Path object to check

        Returns:
            True if supported files are found
        """
        from app.core import get_config
        config = get_config()
        supported_extensions = config.get_supported_extensions()

        # any() short-circuits on the first match instead of materializing
        # the entire recursive glob with list().
        for ext in supported_extensions:
            if any(path_obj.rglob(f"*{ext}")):
                return True

        return False
app/handlers/export_handler.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export Handler
4
+
5
+ Handles report export operations.
6
+ """
7
+
8
+ from pathlib import Path
9
+
10
+ from app.ui.session_manager import SessionManager
11
+ from app.core.exceptions import ProcessingError
12
+ from app.ui.error_handler import handle_ui_errors
13
+ from app.core.exceptions import create_processing_error
14
+
15
+
16
class ExportHandler:
    """
    Export handler that manages report export operations.

    Each export method returns a (file_name, markdown_content) pair and
    raises a ProcessingError when the required analysis is missing.
    """

    def __init__(self, session: SessionManager):
        """Initialize handler with session manager"""
        self.session = session

    @staticmethod
    def _format_checklist_items(items) -> str:
        """Render one checklist category's items as a markdown bullet list.

        Accepts either a list of dicts (uses the 'text' field) or plain
        values; non-list input yields only the trailing blank line.
        """
        content = ""
        if isinstance(items, list):
            for item in items:
                if isinstance(item, dict):
                    content += f"- {item.get('text', str(item))}\n"
                else:
                    content += f"- {str(item)}\n"
        content += "\n"
        return content

    @handle_ui_errors("Export overview report", "Please ensure overview analysis is complete")
    def export_overview_report(self) -> tuple[str, str]:
        """
        Export company overview report.

        Returns:
            Tuple of (file_name, content)

        Raises:
            ProcessingError: If no overview analysis is available.
        """
        if not self.session.overview_summary:
            raise create_processing_error(
                "No overview analysis available for export",
                recovery_hint="Please complete the overview analysis first"
            )

        company_name = self._get_company_name()
        file_name = f"company_overview_{company_name}.md"
        content = f"# Company Overview\n\n{self.session.overview_summary}"

        return file_name, content

    @handle_ui_errors("Export strategic report", "Please ensure strategic analysis is complete")
    def export_strategic_report(self) -> tuple[str, str]:
        """
        Export strategic analysis report (includes overview when present).

        Returns:
            Tuple of (file_name, content)

        Raises:
            ProcessingError: If no strategic analysis is available.
        """
        if not self.session.strategic_summary:
            raise create_processing_error(
                "No strategic analysis available for export",
                recovery_hint="Please complete the strategic analysis first"
            )

        company_name = self._get_company_name()
        file_name = f"dd_report_{company_name}.md"

        content = "# Due Diligence Report\n\n"
        if self.session.overview_summary:
            content += f"## Company Overview\n\n{self.session.overview_summary}\n\n"
        content += f"## Strategic Analysis\n\n{self.session.strategic_summary}"

        return file_name, content

    @handle_ui_errors("Export combined report", "Please ensure analysis is complete")
    def export_combined_report(self) -> tuple[str, str]:
        """
        Export combined due diligence report with every available section
        (overview, strategic, checklist, answered questions).

        Returns:
            Tuple of (file_name, content)

        Raises:
            ProcessingError: If neither overview nor strategic analysis exists.
        """
        if not (self.session.overview_summary or self.session.strategic_summary):
            raise create_processing_error(
                "No analysis data available for export",
                recovery_hint="Please complete overview or strategic analysis first"
            )

        company_name = self._get_company_name()
        file_name = f"complete_dd_report_{company_name}.md"

        content = f"# Complete Due Diligence Report - {company_name.title()}\n\n"

        if self.session.overview_summary:
            content += f"## Company Overview\n\n{self.session.overview_summary}\n\n"

        if self.session.strategic_summary:
            content += f"## Strategic Analysis\n\n{self.session.strategic_summary}\n\n"

        # Add checklist results if available
        if self.session.checklist_results:
            content += "## Checklist Analysis\n\n"
            for category, items in self.session.checklist_results.items():
                content += f"### {category}\n\n"
                content += self._format_checklist_items(items)

        # Add question answers if available (only answered ones)
        if self.session.question_answers:
            content += "## Due Diligence Questions\n\n"
            for question, answer in self.session.question_answers.items():
                if isinstance(answer, dict) and answer.get('has_answer'):
                    content += f"### {question}\n\n{answer.get('answer', '')}\n\n"

        return file_name, content

    @handle_ui_errors("Export checklist report", "Please ensure checklist analysis is complete")
    def export_checklist_report(self) -> tuple[str, str]:
        """
        Export checklist analysis report.

        Returns:
            Tuple of (file_name, content)

        Raises:
            ProcessingError: If no checklist results are available.
        """
        if not self.session.checklist_results:
            raise create_processing_error(
                "No checklist results available for export",
                recovery_hint="Please complete the checklist analysis first"
            )

        company_name = self._get_company_name()
        file_name = f"checklist_analysis_{company_name}.md"

        content = f"# Checklist Analysis Report - {company_name.title()}\n\n"

        for category, items in self.session.checklist_results.items():
            content += f"## {category}\n\n"
            content += self._format_checklist_items(items)

        return file_name, content

    def _get_company_name(self) -> str:
        """Derive the company name from the first document's parent folder.

        Falls back to 'export' when no documents are loaded.
        """
        documents = self.session.documents
        if documents:
            # next(iter(...)) takes the first key without building a list.
            return Path(next(iter(documents))).parent.name
        return 'export'
app/main.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main Application Entry Point
4
+ """
5
+
6
+ # Standard library imports
7
+ import os
8
+ import warnings
9
+
10
+ # Third-party imports
11
+ import streamlit as st
12
+
13
+ # Local imports
14
+ from app.core.config import init_app_config
15
+ from app.core.logging import configure_langchain_logging
16
+ from app.handlers.ai_handler import AIHandler
17
+ from app.handlers.document_handler import DocumentHandler
18
+ from app.handlers.export_handler import ExportHandler
19
+ from app.ui.session_manager import SessionManager
20
+ from app.ui.sidebar import Sidebar
21
+ from app.ui.tabs.checklist_tab import ChecklistTab
22
+ from app.ui.tabs.graph_tab import GraphTab
23
+ from app.ui.tabs.overview_tab import OverviewTab
24
+ from app.ui.tabs.qa_tab import QATab
25
+ from app.ui.tabs.questions_tab import QuestionsTab
26
+ from app.ui.tabs.strategic_tab import StrategicTab
27
+
28
+ # Enable tokenizers parallelism for better performance
29
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
30
+
31
+ # Initialize for Streamlit Cloud deployment (must be done before other imports)
32
+ try:
33
+ from scripts.streamlit_cloud_config import initialize_for_streamlit_cloud
34
+ initialize_for_streamlit_cloud()
35
+ except ImportError:
36
+ # Local development - skip cloud initialization
37
+ pass
38
+
39
+ # Only suppress specific known non-critical warnings
40
+ warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
41
+ warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
42
+
43
+
44
class App:
    """Main application class that orchestrates all components."""

    # Render order of the tabs; must match the tab_names list in run().
    TAB_KEYS = ('overview', 'strategic', 'checklist', 'questions', 'qa', 'graph')

    def __init__(self):
        """Initialize configuration, session, handlers, UI components and page setup."""
        # Initialize configuration
        self.config = init_app_config()

        # Initialize session manager
        self.session = SessionManager()

        # Initialize handlers (business logic between UI and services)
        self.document_handler = DocumentHandler(self.session)
        self.ai_handler = AIHandler(self.session)
        self.export_handler = ExportHandler(self.session)

        # Initialize UI components
        self.sidebar = Sidebar(self.session, self.config)
        self.tabs = {
            'overview': OverviewTab(self.session, self.config, self.ai_handler, self.export_handler),
            'strategic': StrategicTab(self.session, self.config, self.ai_handler, self.export_handler),
            'checklist': ChecklistTab(self.session, self.config, self.ai_handler),
            'questions': QuestionsTab(self.session, self.config, self.ai_handler),
            'qa': QATab(self.session, self.config, self.ai_handler),
            'graph': GraphTab(self.session, self.config, self.ai_handler, self.export_handler)
        }

        # Configure Streamlit page (must run before other st.* rendering calls)
        st.set_page_config(
            page_title=self.config.ui['page_title'],
            page_icon=self.config.ui['page_icon'],
            layout=self.config.ui['layout']
        )

    def run(self):
        """Render the header, sidebar and all tabs, then trigger processing."""
        # Render header
        st.title("🤖 AI Due Diligence")
        st.markdown("**Intelligent M&A Analysis:** Strategic assessment, automated document review, and AI-powered insights")

        # Render sidebar and get selections
        data_room_path, process_button = self.sidebar.render()

        # Store the selected data room path
        if data_room_path:
            self.session.data_room_path = data_room_path

        # Main tabs — same order as TAB_KEYS
        tab_names = [
            "🏢 Company Overview",
            "🎯 Strategic Analysis",
            "📊 Checklist Matching",
            "❓ Due Diligence Questions",
            "💬 Q&A with Citations",
            "🧠 Knowledge Graph"
        ]

        # One loop instead of six copy-pasted `with tabs[i]:` blocks.
        for tab, key in zip(st.tabs(tab_names), self.TAB_KEYS):
            with tab:
                self.tabs[key].render()

        # Processing trigger
        if process_button and data_room_path:
            with st.spinner("🚀 Processing data room..."):
                self.sidebar.process_data_room(data_room_path)
+
126
+
127
def main():
    """Application entry point: configure logging, then build and run the app."""
    # Quiet LangChain's verbose default logging before the app starts.
    configure_langchain_logging(log_level="WARNING")

    try:
        App().run()
    except Exception as exc:
        from app.ui.error_handler import ErrorHandler
        ErrorHandler.handle_error(
            exc,
            "Application startup failed",
            recovery_hint="Please refresh the page and try again"
        )
        st.stop()
143
+
144
+
145
+ if __name__ == "__main__":
146
+ main()
app/services/ai_client.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AI Client
4
+
5
+ Handles Anthropic API client and LLM interaction logic.
6
+ Provides clean interface for LLM operations and connection management.
7
+ """
8
+
9
+ from typing import Optional, Any, List
10
+
11
+ from app.core.exceptions import AIError
12
+ from app.services.ai_config import AIConfig
13
+ from app.core.exceptions import LLMConnectionError, LLMAuthenticationError, LLMTimeoutError, LLMQuotaExceededError, LLMInvalidResponseError
14
+
15
+ # Import specific exception types for robust error handling
16
+ try:
17
+ from anthropic import (
18
+ APIConnectionError, APIError, APITimeoutError, AuthenticationError,
19
+ BadRequestError, ConflictError, InternalServerError, NotFoundError,
20
+ PermissionDeniedError, RateLimitError, UnprocessableEntityError,
21
+ ServiceUnavailableError
22
+ )
23
+ except ImportError:
24
+ # Fallback if anthropic package is not directly available
25
+ APIConnectionError = APIError = APITimeoutError = AuthenticationError = None
26
+ BadRequestError = ConflictError = InternalServerError = NotFoundError = None
27
+ PermissionDeniedError = RateLimitError = UnprocessableEntityError = None
28
+ ServiceUnavailableError = None
29
+
30
+
31
class AIClient:
    """
    Anthropic API client for LLM interactions.

    Manages the connection to Anthropic's Claude models via LangChain's
    ChatAnthropic wrapper. Initialization is lazy: no network traffic
    happens until the first operation touches the client.
    """

    def __init__(self, config: AIConfig) -> None:
        """
        Store the configuration; the LLM connection is created lazily.

        Args:
            config: AIConfig object containing service configuration.
        """
        self.config: AIConfig = config
        # Underlying ChatAnthropic instance, created on first use.
        self._llm: Optional[Any] = None
        self._initialized: bool = False

    def _ensure_initialized(self) -> None:
        """
        Ensure the AI client is initialized and ready for use.

        On first call this creates the ChatAnthropic connection and validates
        it with a short round-trip test query.

        Raises:
            AIError: If initialization fails due to configuration or
                connection issues (specific subclasses raised via
                _handle_llm_error for known failure modes).
        """
        if self._initialized:
            return

        try:
            from langchain_anthropic import ChatAnthropic

            self._llm = ChatAnthropic(
                api_key=self.config.api_key,
                model=self.config.model,
                temperature=self.config.temperature,
                max_tokens=self.config.max_tokens
            )

            # Test the connection with a simple query that validates AI functionality.
            from langchain_core.messages import HumanMessage
            test_response = self._llm.invoke([
                HumanMessage(content="Please respond with 'AI connection successful' if you can read this message.")
            ])
            if not test_response or not hasattr(test_response, 'content') or not test_response.content.strip():
                raise AIError("AI service test failed - no valid response received")

            # Verify the response contains expected content (loose check: the
            # model may paraphrase, so accept either keyword).
            response_text = test_response.content.strip().lower()
            if "successful" not in response_text and "ai" not in response_text:
                raise AIError("AI service test failed - unexpected response format")

            self._initialized = True

        except ImportError as e:
            raise AIError(
                f"Missing required AI library: {str(e)}",
                user_message="AI libraries not installed",
                recovery_hint="Please install required dependencies"
            ) from e
        except AIError:
            # Bug fix: the test-failure AIErrors raised above were previously
            # caught by the generic handler below and re-wrapped into a
            # misleading "Failed to initialize" message. Propagate unchanged.
            raise
        except Exception as e:
            self._handle_llm_error(e)

    def _handle_llm_error(self, error: Exception, include_invalid_response: bool = False) -> None:
        """
        Classify an LLM-related error and re-raise it as a specific AIError.

        Uses exception type checking as the primary classification method,
        with string-based fallbacks for compatibility with different library
        versions. This method never returns normally - it always raises.

        Args:
            error: The exception that occurred.
            include_invalid_response: Whether invalid-response classification
                (and wording for the default error) should be used.

        Raises:
            LLMAuthenticationError / LLMTimeoutError / LLMQuotaExceededError /
            LLMConnectionError / LLMInvalidResponseError: For recognized causes.
            AIError: For any unclassified failure.
        """
        # Primary: check exception types for robust classification.
        if self._is_authentication_error(error):
            raise LLMAuthenticationError(
                f"AI authentication failed: {str(error)}",
                user_message="AI authentication failed",
                recovery_hint="Please check your API key"
            ) from error
        elif self._is_timeout_error(error):
            raise LLMTimeoutError(
                f"AI service timeout: {str(error)}",
                user_message="AI service timed out",
                recovery_hint="Please try again later"
            ) from error
        elif self._is_quota_error(error):
            raise LLMQuotaExceededError(
                f"AI quota exceeded: {str(error)}",
                user_message="AI quota exceeded",
                recovery_hint="Please check your API usage limits"
            ) from error
        elif self._is_connection_error(error):
            raise LLMConnectionError(
                f"AI connection failed: {str(error)}",
                user_message="AI connection failed",
                recovery_hint="Please check your network connection"
            ) from error
        elif include_invalid_response and self._is_invalid_response_error(error):
            raise LLMInvalidResponseError(
                f"AI returned invalid response: {str(error)}",
                user_message="AI returned invalid response",
                recovery_hint="Please try again"
            ) from error

        # Default error messages based on context (init vs. generation).
        if include_invalid_response:
            raise AIError(
                f"Response generation failed: {str(error)}",
                user_message="Failed to generate AI response",
                recovery_hint="Please try again or check your API key"
            ) from error
        else:
            raise AIError(
                f"Failed to initialize AI client: {str(error)}",
                user_message="AI client initialization failed",
                recovery_hint="Please check your API key and network connection"
            ) from error

    def _is_authentication_error(self, error: Exception) -> bool:
        """Check if error is an authentication-related error."""
        # Primary: check exception types (names are None when the SDK
        # version does not export them).
        if AuthenticationError and isinstance(error, AuthenticationError):
            return True
        if PermissionDeniedError and isinstance(error, PermissionDeniedError):
            return True

        # Fallback: string-based detection for compatibility.
        error_msg = str(error).lower()
        return "authentication" in error_msg or "api key" in error_msg or "unauthorized" in error_msg

    def _is_timeout_error(self, error: Exception) -> bool:
        """Check if error is a timeout-related error."""
        # Primary: check exception types.
        if APITimeoutError and isinstance(error, APITimeoutError):
            return True

        # Fallback: string-based detection.
        error_msg = str(error).lower()
        return "timeout" in error_msg or "timed out" in error_msg

    def _is_quota_error(self, error: Exception) -> bool:
        """Check if error is a quota/rate limit related error."""
        # Primary: check exception types.
        if RateLimitError and isinstance(error, RateLimitError):
            return True

        # Fallback: string-based detection.
        error_msg = str(error).lower()
        return "quota" in error_msg or "rate limit" in error_msg or "limit exceeded" in error_msg

    def _is_connection_error(self, error: Exception) -> bool:
        """Check if error is a connection/network related error."""
        # Primary: check exception types.
        if APIConnectionError and isinstance(error, APIConnectionError):
            return True
        if InternalServerError and isinstance(error, InternalServerError):
            return True
        if ServiceUnavailableError and isinstance(error, ServiceUnavailableError):
            return True

        # Fallback: string-based detection.
        error_msg = str(error).lower()
        return ("connection" in error_msg or "network" in error_msg or
                "connection reset" in error_msg or "connection refused" in error_msg)

    def _is_invalid_response_error(self, error: Exception) -> bool:
        """Check if error is related to invalid/malformed responses."""
        # Primary: check exception types.
        if BadRequestError and isinstance(error, BadRequestError):
            return True
        if UnprocessableEntityError and isinstance(error, UnprocessableEntityError):
            return True

        # Fallback: string-based detection.
        error_msg = str(error).lower()
        return ("invalid" in error_msg or "malformed" in error_msg or
                "bad request" in error_msg or "unprocessable" in error_msg)

    @property
    def is_available(self) -> bool:
        """
        Check if the AI client is available and ready for operations.

        Performs lazy initialization (which may issue a test API call) and
        swallows initialization failures.

        Returns:
            True if the client is initialized and ready, False otherwise.
        """
        try:
            self._ensure_initialized()
            return True
        except AIError:
            return False

    @property
    def llm(self) -> Any:
        """
        Get the underlying LLM instance for direct access.

        Provides the raw LangChain ChatAnthropic object for advanced use
        cases that require direct interaction.

        Returns:
            LangChain LLM instance (ChatAnthropic).

        Raises:
            AIError: If the LLM cannot be initialized.
        """
        self._ensure_initialized()
        return self._llm

    def generate_response(self, messages: List[dict]) -> str:
        """
        Generate a response using the LLM.

        Args:
            messages: List of message dictionaries for the LLM.

        Returns:
            Generated response content, stripped of surrounding whitespace.

        Raises:
            AIError: If response generation fails.
        """
        self._ensure_initialized()

        try:
            response = self._llm.invoke(messages)
            return response.content.strip()
        except Exception as e:
            # _handle_llm_error always raises a specific AIError subclass.
            self._handle_llm_error(e, include_invalid_response=True)

    def generate_text(self, prompt: str, context: Optional[List[str]] = None) -> str:
        """
        Generate text using the AI client.

        Args:
            prompt: The main prompt for text generation.
            context: Optional context documents prepended to the prompt
                (only the first 3 are used, to limit token consumption).

        Returns:
            Generated text response, stripped of surrounding whitespace.

        Raises:
            AIError: If text generation fails.
        """
        self._ensure_initialized()

        # Prepare the full prompt.
        full_prompt = prompt
        if context:
            context_str = "\n\n".join(context[:3])  # Limit context to prevent token overflow
            full_prompt = f"Context:\n{context_str}\n\n{prompt}"

        try:
            from langchain_core.messages import HumanMessage

            response = self._llm.invoke([HumanMessage(content=full_prompt)])
            return response.content.strip()

        except Exception as e:
            # _handle_llm_error always raises a specific AIError subclass.
            self._handle_llm_error(e, include_invalid_response=True)
app/services/ai_config.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AI Configuration
4
+
5
+ Configuration settings for AI service operations.
6
+ Provides type safety and validation for AI service parameters.
7
+ """
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from app.core.exceptions import ConfigError
12
+ from app.core.constants import TEMPERATURE
13
+
14
+
15
@dataclass
class AIConfig:
    """
    Configuration bundle for AI service operations.

    Groups every parameter needed to initialize and run the AI service,
    and offers a ``validate()`` hook for fail-fast checking before any
    network call is made.

    Attributes:
        api_key: Anthropic API key used for authentication.
        model: Claude model identifier to run operations against.
        temperature: Sampling temperature (0.0 = deterministic output,
            larger values = more creative output).
        max_tokens: Upper bound on tokens generated per response.

    Example:
        config = AIConfig(
            api_key="sk-ant-...",
            model="claude-3-5-sonnet",
            temperature=TEMPERATURE,
            max_tokens=4000
        )
    """
    api_key: str
    model: str
    temperature: float = TEMPERATURE
    max_tokens: int = 4000

    def validate(self) -> None:
        """
        Check that the required string fields are present and non-blank.

        Raises:
            ConfigError: If the API key or the model name is missing or
                consists only of whitespace.
        """
        key_present = bool(self.api_key) and bool(self.api_key.strip())
        if not key_present:
            raise ConfigError(
                "AI API key is missing",
                user_message="API key is required for AI features",
                recovery_hint="Please configure your Anthropic API key in the sidebar"
            )

        model_present = bool(self.model) and bool(self.model.strip())
        if not model_present:
            raise ConfigError(
                "AI model is not specified",
                user_message="AI model selection is required",
                recovery_hint="Please select a Claude model"
            )
app/services/ai_service.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AI Service
4
+
5
+ Provides a clean interface for AI operations.
6
+ Reduces coupling between AI components and the rest of the system.
7
+ """
8
+
9
+ from typing import Optional, Dict, List, Any
10
+
11
+ from app.core.exceptions import AIError, ConfigError
12
+ # Removed circular import: from app.ui.error_handler import handle_processing_errors
13
+ from app.core.exceptions import create_config_error
14
+ from app.core.constants import QA_MAX_TOKENS, SUPPORTED_ANALYSIS_TYPES
15
+ from app.services.ai_config import AIConfig
16
+ from app.services.ai_client import AIClient
17
+ from app.services.response_parser import ResponseParser
18
+
19
+
20
class AIService:
    """
    Type-safe facade over the Anthropic-backed AI client.

    Replaces the earlier DDChecklistAgent with a small, predictable surface
    wrapping AIClient. Responsibilities:

    - free-form text generation with optional document context
    - document analysis ("overview", "strategic", "checklist", "questions")
    - question answering over pre-selected document excerpts
    - rough token-usage estimation and limit checks

    Attributes:
        config: AIConfig object containing service configuration.
        is_available: Property indicating if the service is ready for use.

    Example:
        config = AIConfig(api_key="sk-ant-...", model="claude-3-sonnet-20240229")
        ai_service = AIService(config)

        if ai_service.is_available:
            result = ai_service.analyze_documents(docs, "overview")
            answer = ai_service.answer_question("What is the revenue?", context)
    """

    def __init__(self, config: AIConfig) -> None:
        """
        Validate the configuration and store it; the client is created lazily.

        Args:
            config: AIConfig object containing service configuration.

        Raises:
            ConfigError: If configuration validation fails.
        """
        self.config: AIConfig = config
        self.config.validate()
        self._client: Optional[AIClient] = None

    @property
    def _ensure_client(self) -> AIClient:
        """
        Lazily construct and return the underlying AIClient.

        Returns:
            Initialized AIClient instance.

        Raises:
            AIError: If client initialization fails.
        """
        if self._client is None:
            self._client = AIClient(self.config)
        return self._client

    @property
    def is_available(self) -> bool:
        """
        True when the AI client can be initialized and reached.

        Performs lazy initialization if needed; initialization failures are
        reported as unavailability rather than raised.

        Returns:
            True if the AI service is ready, False otherwise.
        """
        try:
            return self._ensure_client.is_available
        except (AIError, ConfigError):
            return False

    @property
    def llm(self) -> Any:
        """
        Raw LangChain LLM instance for advanced, direct use.

        Returns:
            LangChain LLM instance (ChatAnthropic).

        Raises:
            AIError: If the underlying client cannot be initialized.
        """
        return self._ensure_client.llm

    def generate_text(
        self,
        prompt: str,
        context: Optional[List[str]] = None,
        max_length: Optional[int] = None
    ) -> str:
        """
        Generate free-form text, optionally grounded in context documents.

        Args:
            prompt: Main instruction for the model.
            context: Optional supporting document excerpts.
            max_length: Optional cap on the formatted response length.

        Returns:
            Formatted model response.
        """
        raw_response = self._ensure_client.generate_text(prompt, context)
        return ResponseParser.format_response(raw_response, max_length)

    def analyze_documents(
        self,
        documents: Dict[str, Dict[str, Any]],
        analysis_type: str,
        strategy_text: Optional[str] = None,
        checklist_results: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Analyze documents using AI with a selectable analysis type.

        Args:
            documents: Mapping of document names to document data; each
                value must be a dict with a non-empty 'content' entry.
            analysis_type: One of the supported types:
                - "overview": company overview and business analysis
                - "strategic": strategic positioning and recommendations
                - "checklist": due diligence checklist analysis
                - "questions": answer due diligence questions
            strategy_text: Optional strategy document content for context.
            checklist_results: Optional existing checklist results used by
                the overview/strategic prompts.

        Returns:
            AI-generated analysis text.

        Raises:
            AIError: If analysis fails or the service is unavailable.
            ValueError: If the inputs are malformed or analysis_type is
                not supported.

        Example:
            docs = {"annual_report.pdf": {"content": "...", "name": "Annual Report"}}
            analysis = ai_service.analyze_documents(docs, "overview")
        """
        # Guard clauses: reject malformed input before any AI work happens.
        if not documents:
            raise ValueError("Documents dictionary cannot be None or empty")
        if not isinstance(documents, dict):
            raise ValueError("Documents must be a dictionary")
        if analysis_type not in SUPPORTED_ANALYSIS_TYPES:
            raise ValueError(f"Invalid analysis type: {analysis_type}. Supported types: {SUPPORTED_ANALYSIS_TYPES}")
        self._check_document_entries(documents)

        # Build context excerpts and the type-specific prompt, then generate.
        context_docs = ResponseParser.prepare_context_documents(documents)
        prompt = self._get_analysis_prompt(analysis_type, context_docs, strategy_text, checklist_results)
        return self.generate_text(prompt, max_length=3000)

    @staticmethod
    def _check_document_entries(documents: Dict[str, Dict[str, Any]]) -> None:
        """Reject malformed or empty document entries with descriptive errors."""
        for doc_name, doc_data in documents.items():
            if not isinstance(doc_data, dict):
                raise ValueError(f"Document '{doc_name}' must be a dictionary")
            if 'content' not in doc_data:
                raise ValueError(f"Document '{doc_name}' must contain a 'content' key")
            if not doc_data['content']:
                raise ValueError(f"Document '{doc_name}' content cannot be empty")

    def _get_analysis_prompt(self, analysis_type: str, context_docs: List[str],
                             strategy_text: Optional[str] = None,
                             checklist_results: Optional[Dict[str, Any]] = None) -> str:
        """
        Build the prompt for the requested analysis type.

        Args:
            analysis_type: Type of analysis to perform.
            context_docs: Prepared context documents.
            strategy_text: Optional strategy document content.
            checklist_results: Optional existing checklist results.

        Returns:
            Prompt string for the specified analysis type.

        Raises:
            ValueError: If analysis_type is not supported.
        """
        # Dispatch table instead of an if-chain; lambdas defer construction
        # until the type has been resolved.
        builders = {
            "overview": lambda: ResponseParser.create_overview_prompt(context_docs, strategy_text, checklist_results),
            "strategic": lambda: ResponseParser.create_strategic_prompt(context_docs, strategy_text, checklist_results),
            "checklist": lambda: ResponseParser.create_checklist_prompt(context_docs),
            "questions": lambda: ResponseParser.create_questions_prompt(context_docs),
        }
        builder = builders.get(analysis_type)
        if builder is None:
            raise ValueError(f"Unknown analysis type: {analysis_type}")
        return builder()

    def answer_question(
        self,
        question: str,
        context_docs: List[str],
        max_length: Optional[int] = None
    ) -> str:
        """
        Answer a specific question using AI with document context.

        Args:
            question: The question to answer; clear, specific questions
                give the best results.
            context_docs: Relevant document excerpts, ideally pre-filtered
                to the most relevant content.
            max_length: Optional maximum answer length in characters;
                defaults to QA_MAX_TOKENS when omitted.

        Returns:
            AI-generated answer grounded in the supplied excerpts.

        Raises:
            AIError: If question answering fails or the service is unavailable.
            ValueError: If the question or context inputs are malformed.

        Example:
            answer = ai_service.answer_question(
                "What was the company's revenue for Q4 2023?",
                ["The company reported $50M in revenue for Q4 2023..."]
            )
        """
        # Guard clauses mirror analyze_documents: validate before calling out.
        if not question or not isinstance(question, str):
            raise ValueError("Question must be a non-empty string")
        if not context_docs:
            raise ValueError("Context documents list cannot be None or empty")
        if not isinstance(context_docs, list):
            raise ValueError("Context documents must be a list")
        self._check_context_docs(context_docs)

        prompt = ResponseParser.create_question_answer_prompt(question, context_docs)
        return self.generate_text(prompt, max_length=max_length or QA_MAX_TOKENS)

    @staticmethod
    def _check_context_docs(context_docs: List[str]) -> None:
        """Validate that every context entry is a non-blank string."""
        for i, doc in enumerate(context_docs):
            if not isinstance(doc, str):
                raise ValueError(f"Context document at index {i} must be a string")
            if not doc.strip():
                raise ValueError(f"Context document at index {i} cannot be empty or whitespace only")

    def get_token_usage_estimate(self, text: str) -> int:
        """
        Roughly estimate token count for a text.

        Uses the ~4-characters-per-token heuristic typical for English text
        with Claude models; actual counts depend on the model's tokenizer.

        Args:
            text: Text to estimate token count for.

        Returns:
            Estimated token count (0 for empty input).
        """
        # Rough estimation: ~4 characters per token for English text.
        return len(text) // 4 if text else 0

    def is_within_token_limit(self, text: str, max_tokens: int = 100000) -> bool:
        """
        Check whether a text's estimated token count fits a limit.

        Args:
            text: Text to check for token limit compliance.
            max_tokens: Maximum allowed tokens (default 100,000 — a
                conservative cap for most AI models).

        Returns:
            True if the estimate is within the limit, False otherwise.

        Note:
            Character-based estimation only; use a real tokenizer for
            critical limit checks.
        """
        if not text:
            return True
        return self.get_token_usage_estimate(text) <= max_tokens
350
+
351
+
352
+ # Factory function for easy service creation
353
+ def create_ai_service(
354
+ api_key: str,
355
+ model: str,
356
+ temperature: float = 0.1,
357
+ max_tokens: int = 4000
358
+ ) -> AIService:
359
+ """
360
+ Create and configure an AI service instance with the given parameters.
361
+
362
+ This factory function provides a convenient way to create AI service instances
363
+ with proper configuration and validation. It handles all the setup steps
364
+ including configuration validation and service initialization.
365
+
366
+ Args:
367
+ api_key: Anthropic API key for authentication. Must be a valid
368
+ Anthropic API key with sufficient permissions.
369
+ model: Claude model to use for AI operations. Examples:
370
+ - "claude-3-5-sonnet" (recommended for most use cases)
371
+ - "claude-3-5-haiku-20241022" (faster, less expensive)
372
+ - "claude-3-opus-20240229" (most capable, more expensive)
373
+ temperature: Sampling temperature for response generation (0.0 to 1.0).
374
+ Lower values (0.1) produce more deterministic responses.
375
+ Higher values (0.7+) produce more creative responses.
376
+ max_tokens: Maximum tokens to generate in AI responses.
377
+ Default 4000 tokens provides good balance of length and cost.
378
+
379
+ Returns:
380
+ Fully configured and validated AIService instance ready for use
381
+
382
+ Raises:
383
+ ConfigError: If configuration parameters are invalid
384
+ AIError: If AI service initialization fails
385
+
386
+ Example:
387
+ # Basic usage
388
+ ai_service = create_ai_service("sk-ant-...", "claude-3-5-sonnet")
389
+
390
+ # Advanced configuration
391
+ ai_service = create_ai_service(
392
+ api_key="sk-ant-...",
393
+ model="claude-3-5-haiku-20241022",
394
+ temperature=0.2,
395
+ max_tokens=QA_MAX_TOKENS
396
+ )
397
+
398
+ # Use the service
399
+ if ai_service.is_available:
400
+ answer = ai_service.answer_question("What is AI?", ["AI is artificial intelligence..."])
401
+ """
402
+ # Validate and resolve API key
403
+ api_key = _resolve_api_key(api_key)
404
+
405
+ config = AIConfig(
406
+ api_key=api_key,
407
+ model=model,
408
+ temperature=temperature,
409
+ max_tokens=max_tokens
410
+ )
411
+ return AIService(config)
412
+
413
+
414
+ def _resolve_api_key(api_key: Optional[str]) -> str:
415
+ """
416
+ Resolve API key from parameter or environment variable.
417
+
418
+ Args:
419
+ api_key: API key provided by user, or None
420
+
421
+ Returns:
422
+ Resolved API key string
423
+
424
+ Raises:
425
+ ConfigError: If no API key is available
426
+ """
427
+ if api_key is not None:
428
+ return api_key
429
+
430
+ import os
431
+ env_key = os.getenv('ANTHROPIC_API_KEY')
432
+ if env_key is not None:
433
+ return env_key
434
+
435
+ raise create_config_error(
436
+ "AI API key is missing",
437
+ recovery_hint="Please set ANTHROPIC_API_KEY environment variable or pass api_key parameter"
438
+ )
app/services/response_parser.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Response Parser
4
+
5
+ Handles response parsing and formatting functions for AI operations.
6
+ Provides methods for creating prompts and processing AI responses.
7
+ """
8
+
9
+ from typing import List, Dict, Any, Optional
10
+
11
+ from app.core.exceptions import ProcessingError
12
+
13
+
14
+ class ResponseParser:
15
+ """
16
+ Parser for AI responses and prompt generation.
17
+
18
+ This class provides methods for creating structured prompts
19
+ and processing AI responses for different analysis types.
20
+ """
21
+
22
+ @staticmethod
23
+ def create_overview_prompt(
24
+ context_docs: List[str],
25
+ strategy_text: Optional[str],
26
+ checklist_results: Optional[Dict]
27
+ ) -> str:
28
+ """Create overview analysis prompt"""
29
+ prompt = "Based on the following company documents, provide a comprehensive overview analysis:\n\n"
30
+
31
+ if context_docs:
32
+ prompt += "Company Documents:\n" + "\n\n".join(context_docs) + "\n\n"
33
+
34
+ if strategy_text:
35
+ prompt += f"Strategic Context:\n{strategy_text[:1000]}\n\n"
36
+
37
+ if checklist_results:
38
+ prompt += f"Checklist Findings:\n{str(checklist_results)[:1000]}\n\n"
39
+
40
+ prompt += """Please provide:
41
+ 1. Company overview and business model
42
+ 2. Key strengths and competitive advantages
43
+ 3. Main risks and challenges
44
+ 4. Financial health indicators
45
+ 5. Strategic recommendations
46
+
47
+ Be specific, factual, and focus on the most important insights."""
48
+
49
+ return prompt
50
+
51
+ @staticmethod
52
+ def create_strategic_prompt(
53
+ context_docs: List[str],
54
+ strategy_text: Optional[str],
55
+ checklist_results: Optional[Dict]
56
+ ) -> str:
57
+ """Create strategic analysis prompt"""
58
+ prompt = "Provide a strategic analysis based on the following company information:\n\n"
59
+
60
+ if strategy_text:
61
+ prompt += f"Strategic Framework:\n{strategy_text[:1000]}\n\n"
62
+
63
+ if context_docs:
64
+ prompt += "Company Documents:\n" + "\n\n".join(context_docs) + "\n\n"
65
+
66
+ if checklist_results:
67
+ prompt += f"Operational Findings:\n{str(checklist_results)[:1000]}\n\n"
68
+
69
+ prompt += """Please analyze:
70
+ 1. Strategic positioning and market opportunities
71
+ 2. Operational strengths and weaknesses
72
+ 3. Risk mitigation strategies
73
+ 4. Growth potential and recommendations
74
+ 5. Investment considerations
75
+
76
+ Focus on strategic implications and actionable insights."""
77
+
78
+ return prompt
79
+
80
+ @staticmethod
81
+ def create_checklist_prompt(context_docs: List[str]) -> str:
82
+ """Create checklist analysis prompt"""
83
+ prompt = "Analyze the following documents against standard due diligence checklist items:\n\n"
84
+
85
+ if context_docs:
86
+ prompt += "Documents to Analyze:\n" + "\n\n".join(context_docs) + "\n\n"
87
+
88
+ prompt += """For each major due diligence category, identify:
89
+ 1. What information is available in the documents
90
+ 2. What information appears to be missing
91
+ 3. Any red flags or concerns identified
92
+ 4. Recommendations for further investigation
93
+
94
+ Be thorough and specific in your analysis."""
95
+
96
+ return prompt
97
+
98
+ @staticmethod
99
+ def create_questions_prompt(context_docs: List[str]) -> str:
100
+ """Create questions analysis prompt"""
101
+ prompt = "Answer due diligence questions based on the following documents:\n\n"
102
+
103
+ if context_docs:
104
+ prompt += "Reference Documents:\n" + "\n\n".join(context_docs) + "\n\n"
105
+
106
+ prompt += """For each question, provide:
107
+ 1. Direct answer based on available information
108
+ 2. Supporting evidence from the documents
109
+ 3. Confidence level in the answer
110
+ 4. Any additional context or caveats
111
+
112
+ If information is not available, clearly state this and suggest what additional information would be needed."""
113
+
114
+ return prompt
115
+
116
+ @staticmethod
117
+ def create_question_answer_prompt(question: str, context_docs: List[str]) -> str:
118
+ """Create prompt for answering a specific question"""
119
+ return f"""Based on the following document excerpts, please answer this question:
120
+
121
+ Question: {question}
122
+
123
+ Relevant Document Excerpts:
124
+ {"\n\n".join(context_docs[:5])}
125
+
126
+ Please provide a comprehensive, factual answer with specific references to the source documents.
127
+ If the information is not available in the provided context, clearly state this."""
128
+
129
+ @staticmethod
130
+ def format_response(response: str, max_length: Optional[int] = None) -> str:
131
+ """
132
+ Format and clean AI response.
133
+
134
+ Args:
135
+ response: Raw AI response
136
+ max_length: Optional maximum length for the response
137
+
138
+ Returns:
139
+ Formatted response
140
+
141
+ Raises:
142
+ ProcessingError: If response formatting fails
143
+ """
144
+ try:
145
+ if not response:
146
+ raise ValueError("Response cannot be empty")
147
+
148
+ result = response.strip()
149
+ if max_length and len(result) > max_length:
150
+ result = result[:max_length] + "..."
151
+ return result
152
+ except Exception as e:
153
+ raise ProcessingError(f"Failed to format AI response: {e}")
154
+
155
+ @staticmethod
156
+ def prepare_context_documents(documents: Dict[str, Dict[str, Any]], max_docs: int = 5) -> List[str]:
157
+ """
158
+ Prepare context documents for AI processing.
159
+
160
+ Args:
161
+ documents: Dictionary mapping document names to document data
162
+ max_docs: Maximum number of documents to process
163
+
164
+ Returns:
165
+ List of formatted document contexts
166
+
167
+ Raises:
168
+ ProcessingError: If document preparation fails
169
+ """
170
+ try:
171
+ if not documents:
172
+ raise ValueError("No documents provided for context preparation")
173
+
174
+ context_docs = []
175
+ for doc_key, doc_data in list(documents.items())[:max_docs]:
176
+ if isinstance(doc_data, dict) and 'content' in doc_data:
177
+ content = doc_data['content'][:1000] # Truncate long content
178
+ context_docs.append(f"Document: {doc_data.get('name', doc_key)}\n{content}")
179
+
180
+ if not context_docs:
181
+ raise ValueError("No valid documents found with content")
182
+
183
+ return context_docs
184
+ except Exception as e:
185
+ raise ProcessingError(f"Failed to prepare context documents: {e}")
app/ui/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI Components Package
3
+
4
+ Contains all user interface components and layout functions.
5
+ """
6
+
7
+ from .sidebar import Sidebar
8
+
9
+ __all__ = ['Sidebar']
app/ui/error_handler.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standardized Error Handling System
4
+
5
+ Provides consistent error handling patterns across all modules.
6
+ Centralizes error logging, user messaging, and recovery mechanisms.
7
+ """
8
+
9
+ import logging
10
+ import streamlit as st
11
+ from typing import Any, Optional, Callable, TypeVar
12
+ from functools import wraps
13
+
14
+ from app.core.exceptions import (
15
+ AppException, ValidationError, ProcessingError,
16
+ AIError, ConfigError
17
+ )
18
+
19
# Module-level logger, namespaced to this module per logging convention.
logger = logging.getLogger(__name__)

# Re-export core exceptions for backward compatibility: older call sites
# raise/catch AppError, which is the same class as AppException.
AppError = AppException

# Generic return-type variable used by the decorator/wrapper helpers below.
T = TypeVar('T')
25
+
26
+
27
+ # Exception classes are imported from app.core.exceptions above
28
+
29
+
30
class ErrorHandler:
    """
    Centralized error handling system with consistent patterns.

    All methods are static; the class is used as a namespace. Error severity
    is mapped from the exception type (see ``_log_error``), and user-facing
    messaging is routed through ``app.ui.ui_components.status_message``.
    """

    @staticmethod
    def handle_error(
        error: Exception,
        context: str = "",
        show_user_message: bool = True,
        log_error: bool = True,
        recovery_hint: Optional[str] = None
    ) -> None:
        """
        Handle an error with consistent logging and user messaging.

        Args:
            error: The exception that occurred
            context: Description of where the error occurred
            show_user_message: Whether to show error message to user
            log_error: Whether to log the error
            recovery_hint: Optional hint for user recovery
        """
        if log_error:
            ErrorHandler._log_error(error, context)

        if show_user_message:
            ErrorHandler._show_user_error(error, recovery_hint)

    @staticmethod
    def _log_error(error: Exception, context: str = "") -> None:
        """Log error with appropriate level based on error type.

        Validation/config problems are warnings, processing/AI failures are
        errors, and anything else is logged with a full traceback.
        """
        error_msg = f"{context}: {str(error)}" if context else str(error)

        if isinstance(error, (ValidationError, ConfigError)):
            logger.warning(error_msg)
        elif isinstance(error, (ProcessingError, AIError)):
            logger.error(error_msg)
        else:
            # logger.exception attaches the active traceback for diagnosis.
            logger.exception(f"Unexpected error - {error_msg}")

    @staticmethod
    def _show_user_error(error: Exception, recovery_hint: Optional[str] = None) -> None:
        """Show appropriate error message to user.

        Known app errors expose a curated ``user_message``; unexpected
        exceptions get a generic message so internals are never leaked.
        """
        # Imported lazily here to avoid a circular import with the UI layer.
        from app.ui.ui_components import status_message

        if isinstance(error, AppError):
            user_message = error.user_message
        else:
            # For unexpected errors, don't show internal details
            user_message = "An unexpected error occurred. Please try again."

        # Add recovery hint if provided
        if recovery_hint:
            user_message += f"\n\n💡 {recovery_hint}"

        # Show error message to user; validation issues are non-fatal warnings.
        if isinstance(error, ValidationError):
            status_message(user_message, "warning")
        else:
            status_message(user_message, "error")

    @staticmethod
    def handle_with_recovery(
        func: Callable[..., T],
        context: str = "",
        default_value: Any = None,
        show_spinner: bool = False,
        spinner_text: str = "Processing...",
        recovery_hint: Optional[str] = None
    ) -> Callable[..., T]:
        """
        Wrap a callable with consistent error handling and recovery.

        NOTE: despite the original name, this is a wrapper factory called as
        ``wrapped = ErrorHandler.handle_with_recovery(f, ...)``, not used
        with ``@`` decorator syntax. On failure the error is handled and
        ``default_value`` is returned instead of raising.

        Args:
            func: Function to wrap
            context: Description of the operation
            default_value: Value to return on error
            show_spinner: Whether to show spinner during operation
            spinner_text: Text to show in spinner
            recovery_hint: Hint for user recovery

        Returns:
            Wrapped function with error handling
        """
        @wraps(func)
        def wrapper(*args, **kwargs) -> T:
            try:
                if show_spinner:
                    with st.spinner(spinner_text):
                        return func(*args, **kwargs)
                else:
                    return func(*args, **kwargs)
            except Exception as e:
                ErrorHandler.handle_error(e, context, recovery_hint=recovery_hint)
                return default_value

        return wrapper

    @staticmethod
    def validate_input(value: Any, validator: Callable[[Any], bool], error_message: str) -> bool:
        """
        Validate input with consistent error handling.

        Args:
            value: Value to validate
            validator: Function that returns True if valid
            error_message: Error message if validation fails

        Returns:
            True if validation passes.

        Raises:
            ValidationError: If validation fails or the validator itself
                raises (never returns False — the original docstring's
                "False otherwise" did not match the implementation).
        """
        try:
            if validator(value):
                return True
            else:
                raise ValidationError(error_message)
        except ValidationError:
            raise
        except Exception as e:
            # A crashing validator is reported as a validation failure too.
            raise ValidationError(f"Validation failed: {str(e)}")

    @staticmethod
    def ensure_config_value(config_value: Any, config_name: str) -> Any:
        """
        Ensure a configuration value exists and is valid.

        Args:
            config_value: The configuration value to check
            config_name: Name of the configuration for error messages

        Returns:
            The config value if valid

        Raises:
            ConfigError: If config value is missing or invalid
        """
        # Only None and empty string are rejected; falsy values such as 0 or
        # False are intentionally accepted as valid configuration.
        if config_value is None or config_value == "":
            raise ConfigError(
                f"Configuration '{config_name}' is missing or empty",
                user_message=f"Configuration error: {config_name} is not set",
                recovery_hint="Please check your configuration and environment variables"
            )
        return config_value

    @staticmethod
    def handle_file_operation(
        file_path: str,
        operation: Callable[[], T],
        operation_name: str = "file operation"
    ) -> T:
        """
        Handle file operations with consistent error handling.

        Maps low-level filesystem exceptions to ProcessingError with
        user-appropriate messages and recovery hints.

        Args:
            file_path: Path to the file being operated on
            operation: Function that performs the file operation
            operation_name: Description of the operation

        Returns:
            Result of the file operation

        Raises:
            ProcessingError: On any failure of the underlying operation.
        """
        try:
            return operation()
        except FileNotFoundError:
            raise ProcessingError(
                f"File not found: {file_path}",
                user_message=f"File not found: {file_path}",
                recovery_hint="Please ensure the file exists and try again"
            )
        except PermissionError:
            raise ProcessingError(
                f"Permission denied accessing file: {file_path}",
                user_message=f"Cannot access file: {file_path}",
                recovery_hint="Please check file permissions"
            )
        except Exception as e:
            raise ProcessingError(
                f"Failed to {operation_name} file {file_path}: {str(e)}",
                user_message=f"File operation failed: {operation_name}",
                recovery_hint="Please check the file and try again"
            )
212
+
213
+
214
+ # Convenience decorators for common patterns
215
def handle_ui_errors(context: str = "", recovery_hint: Optional[str] = None):
    """
    Decorator for UI operations that need error handling.

    Any exception raised by the wrapped function is routed through
    ErrorHandler (logged and surfaced to the user) and the call returns
    None instead of propagating the error.

    Args:
        context: Description of the operation
        recovery_hint: Optional hint for user recovery
    """
    def decorator(func):
        @wraps(func)
        def safe_call(*args, **kwargs):
            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                ErrorHandler.handle_error(exc, context, recovery_hint=recovery_hint)
                return None
            return result
        return safe_call
    return decorator
233
+
234
+
235
def handle_processing_errors(context: str = "", recovery_hint: Optional[str] = None):
    """
    Decorator for processing operations that need error handling.

    Failures are logged and surfaced via ErrorHandler, then re-raised so
    the caller can still react to the exception.

    Args:
        context: Description of the operation
        recovery_hint: Optional hint for user recovery
    """
    def decorator(func):
        @wraps(func)
        def reporting_call(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                ErrorHandler.handle_error(exc, context, recovery_hint=recovery_hint)
                raise  # Re-raise for caller to handle
        return reporting_call
    return decorator
253
+
254
+
255
def validate_and_execute(
    validator: Callable[[], bool],
    operation: Callable[[], T],
    validation_error_msg: str = "Validation failed",
    context: str = ""
) -> T:
    """
    Validate and execute operation with consistent error handling.

    Runs ``validator`` first; only a truthy result allows ``operation`` to
    run. A ValidationError always propagates untouched, while any other
    exception is reported through ErrorHandler before being re-raised.

    Args:
        validator: Function that returns True if validation passes
        operation: Function to execute if validation passes
        validation_error_msg: Error message for validation failure
        context: Description of the operation

    Returns:
        Result of the operation

    Raises:
        ValidationError: If validation fails
    """
    try:
        passed = bool(validator())
        if passed:
            return operation()
        raise ValidationError(validation_error_msg, recovery_hint="Please check your input and try again")
    except ValidationError:
        raise
    except Exception as e:
        ErrorHandler.handle_error(e, f"{context} - validation/execution failed")
        raise
app/ui/session_manager.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Session State Manager
4
+
5
+ Manages Streamlit session state with type-safe access.
6
+ """
7
+
8
+ import streamlit as st
9
+ from typing import Any
10
+
11
+ from app.ui.error_handler import ErrorHandler
12
+
13
+
14
+
15
class SessionProperty:
    """
    Descriptor that proxies a named entry in Streamlit's session state.

    Declaring ``attr = SessionProperty(default)`` on a class makes reads of
    ``obj.attr`` look up ``st.session_state['attr']`` (falling back to the
    default) and writes store into it — removing repetitive property
    boilerplate while keeping attribute-style access.
    """

    def __init__(self, default_value: Any = None):
        # NOTE(review): a mutable default (e.g. {}) is a single shared object
        # returned by every fallback read — callers should not mutate it.
        self.name = None
        self.default_value = default_value

    def __set_name__(self, owner, name):
        # Invoked by Python at class-creation time with the attribute name;
        # that name doubles as the session-state key.
        self.name = name

    def __get__(self, instance, owner):
        if instance is None:
            # Class-level access yields the descriptor itself (for introspection).
            return self
        state = st.session_state
        return state.get(self.name, self.default_value)

    def __set__(self, instance, value):
        st.session_state[self.name] = value
37
+
38
+
39
class SessionManager:
    """Session state manager with type-safe access to session data.

    Every attribute below is a SessionProperty descriptor, so reads/writes
    go straight to ``st.session_state`` under the same key name.
    NOTE(review): the mutable defaults ({} / []) are shared objects copied
    by reference into session state in ``_init_defaults`` — acceptable for a
    single Streamlit session, but do not mutate them at class level.
    """

    # Document processing state
    documents = SessionProperty({})
    chunks = SessionProperty([])
    embeddings = SessionProperty(None)

    # Analysis results
    checklist_results = SessionProperty({})
    question_answers = SessionProperty({})
    overview_summary = SessionProperty("")
    strategic_summary = SessionProperty("")

    # User selections
    strategy_path = SessionProperty(None)
    strategy_text = SessionProperty("")
    checklist_path = SessionProperty(None)
    checklist_text = SessionProperty("")
    questions_path = SessionProperty(None)
    questions_text = SessionProperty("")
    vdr_store = SessionProperty(None)
    data_room_path = SessionProperty(None)

    # Processing state
    processing_active = SessionProperty(False)
    agent = SessionProperty(None)

    # Cached data
    checklist = SessionProperty({})
    questions = SessionProperty({})

    def __init__(self) -> None:
        """Initialize session state manager with default values."""
        self._init_defaults()

    def _init_defaults(self) -> None:
        """Initialize default session state values.

        Seeds ``st.session_state`` with every SessionProperty default that
        is not already present (idempotent across Streamlit reruns). On any
        failure the whole session state is wiped and replaced with a minimal
        safe baseline.
        """
        try:
            # Get all descriptor properties and their defaults
            # (discovered via introspection so new properties need no extra wiring).
            all_properties = {
                name: getattr(self.__class__, name).default_value
                for name in dir(self.__class__)
                if isinstance(getattr(self.__class__, name), SessionProperty)
            }

            for key, default_value in all_properties.items():
                if key not in st.session_state:
                    st.session_state[key] = default_value

        except Exception as e:
            ErrorHandler.handle_error(
                e,
                "Session initialization failed",
                recovery_hint="Please refresh the page and try again"
            )
            # Initialize with minimal defaults on error
            st.session_state.clear()
            st.session_state.update({
                'documents': {},
                'processing_active': False,
                'agent': None,
            })


    def reset(self) -> None:
        """Reset analysis results and cached data for fresh analysis."""
        self.overview_summary = ""
        self.strategic_summary = ""
        self.checklist_results = {}
        self.question_answers = {}

    def reset_processing(self) -> None:
        """Reset processing flags to allow new operations."""
        self.processing_active = False

    def ready(self) -> bool:
        """Check if system is ready for analysis operations.

        Ready means at least one document is loaded and no processing run
        is currently active.
        """
        return bool(self.documents is not None and len(self.documents) > 0 and not self.processing_active)
app/ui/sidebar.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Sidebar Component
4
+
5
+ Handles project selection, file selectors, and AI settings.
6
+ """
7
+
8
+ import streamlit as st
9
+ from pathlib import Path
10
+ from typing import Tuple, Optional
11
+
12
+ from app.ui.session_manager import SessionManager
13
+ # Use lazy imports to avoid circular import issues
14
+ # from app.handlers.document_handler import DocumentHandler
15
+ # from app.handlers.ai_handler import AIHandler
16
+ # Import components directly to avoid circular import issues
17
+ import importlib.util
18
+ import os
19
+
20
# Load the ui_components.py module directly via importlib instead of a normal
# "from app.ui import ui_components" — this sidesteps a circular import
# between the sidebar and the ui_components module.
# NOTE(review): this executes ui_components.py a second time under the module
# name "components_module", separate from any normally-imported copy — verify
# that ui_components has no import-time side effects before relying on this.
components_path = os.path.join(os.path.dirname(__file__), 'ui_components.py')
spec = importlib.util.spec_from_file_location("components_module", components_path)
components_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(components_module)

# Import the specific functions we need
render_project_selector = components_module.render_project_selector
render_ai_settings = components_module.render_ai_settings
render_file_selector = components_module.render_file_selector
display_processing_error = components_module.display_processing_error
status_message = components_module.status_message
32
+ from app.core import logger
33
+
34
+
35
class Sidebar:
    """
    Simplified sidebar component that handles all sidebar functionality:
    project/data-room selection, strategy/checklist/questions file pickers,
    and AI agent configuration.
    """

    def __init__(self, session: SessionManager, config):
        """Initialize sidebar with session manager and config"""
        self.session = session
        self.config = config
        # Handlers will be imported lazily when needed
        # (avoids circular imports between UI and handler packages).
        self._document_handler = None
        self._ai_handler = None

    @property
    def document_handler(self):
        """Lazy import of DocumentHandler; instantiated once per Sidebar."""
        if self._document_handler is None:
            from app.handlers.document_handler import DocumentHandler
            self._document_handler = DocumentHandler(self.session)
        return self._document_handler

    @property
    def ai_handler(self):
        """Lazy import of AIHandler; instantiated once per Sidebar."""
        if self._ai_handler is None:
            from app.handlers.ai_handler import AIHandler
            self._ai_handler = AIHandler(self.session)
        return self._ai_handler

    def render(self) -> Tuple[Optional[str], bool]:
        """
        Render sidebar with project selection, file selectors, and AI settings.

        Side effects: writes the selected strategy/checklist/questions paths
        and texts into the session, and may initialize the AI agent.

        Returns:
            Tuple of (data_room_path, process_button_pressed)
        """
        with st.sidebar:
            # Project and data room selection
            selected_project_path, data_room_path = render_project_selector()

            # Process button
            process_button = st.button(
                "🚀 Process Data Room",
                type="primary",
                width='stretch'
            )

            if process_button:
                # Actual processing happens in the main area; this is just feedback.
                st.success("Processing... Check main area for progress")

            st.divider()

            # Analysis Configuration
            st.subheader("📋 Analysis Configuration")

            # Strategy selector
            strategy_path, strategy_text = self._render_file_selector(
                self.config.paths['strategy_dir'], "Strategy", "🎯"
            )
            self.session.strategy_path = strategy_path
            self.session.strategy_text = strategy_text

            # Checklist selector
            checklist_path, checklist_text = self._render_file_selector(
                self.config.paths['checklist_dir'], "Checklist", "📊"
            )
            self.session.checklist_path = checklist_path
            self.session.checklist_text = checklist_text

            # Questions selector
            questions_path, questions_text = self._render_file_selector(
                self.config.paths['questions_dir'], "Questions", "❓"
            )
            self.session.questions_path = questions_path
            self.session.questions_text = questions_text

            st.divider()

            # AI settings
            api_key, model_choice = render_ai_settings()

            # Initialize AI agent if API key is available and no agent exists yet;
            # on setup failure the stored agent is cleared so a retry is possible.
            if api_key:
                existing_agent = self.session.agent
                if existing_agent is None:
                    if self.ai_handler.setup_agent(api_key, model_choice):
                        st.success("✅ AI Agent ready")
                    else:
                        self.session.agent = None

        return data_room_path, process_button

    def _render_file_selector(self, directory: str, label: str, icon: str) -> Tuple[Optional[str], str]:
        """
        Render a file selector for a specific directory.

        Args:
            directory: Path to the directory containing files
            label: Label for the selector
            icon: Icon for the selector

        Returns:
            Tuple of (selected_file_path, selected_file_content);
            (None, "") when rendering fails so the sidebar stays usable.
        """
        try:
            return render_file_selector(directory, label, "sidebar", icon)
        except Exception as e:
            logger.error(f"Failed to render {label.lower()} selector: {e}")
            return None, ""

    def process_data_room(self, data_room_path: str):
        """
        Process a data room using the fast FAISS loading approach.

        Args:
            data_room_path: Path to the data room directory
        """
        try:
            result = self.document_handler.process_data_room_fast(data_room_path)

            if result:
                doc_count, chunk_count = result
                st.success(f"✅ Loaded {doc_count} documents and {chunk_count} chunks from pre-built index!")
                # Rerun so the rest of the UI picks up the freshly loaded session data.
                st.rerun()
            else:
                display_processing_error("data room")
        except Exception as e:
            logger.error(f"Failed to process data room {data_room_path}: {e}")
            display_processing_error("data room", e)
164
+
app/ui/tabs/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tab Components Package
3
+
4
+ Contains all tab-specific UI components and logic.
5
+ """
6
+
7
+ from .tab_base import TabBase
8
+ from .overview_tab import OverviewTab
9
+ from .strategic_tab import StrategicTab
10
+ from .checklist_tab import ChecklistTab
11
+ from .questions_tab import QuestionsTab
12
+ from .qa_tab import QATab
13
+
14
+ __all__ = [
15
+ 'TabBase',
16
+ 'OverviewTab',
17
+ 'StrategicTab',
18
+ 'ChecklistTab',
19
+ 'QuestionsTab',
20
+ 'QATab'
21
+ ]
app/ui/tabs/checklist_tab.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Checklist Tab Component
4
+
5
+ Handles checklist matching and display.
6
+ """
7
+
8
+ import streamlit as st
9
+
10
+ from app.ui.session_manager import SessionManager
11
+ from app.ui.ui_components import (
12
+ status_message,
13
+ render_generate_buttons,
14
+ processing_guard,
15
+ display_generation_error,
16
+ display_initialization_error
17
+ )
18
+ from app.handlers.ai_handler import AIHandler
19
+ from app.core.logging import logger
20
+
21
+
22
class ChecklistTab:
    """
    Checklist matching tab that handles checklist analysis and display.

    Flow: a checklist selected in the sidebar is parsed by the LLM, then
    matched against the pre-built FAISS index of the processed data room.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler"""
        self.session = session
        self.config = config
        self.ai_handler = ai_handler

    def render(self):
        """Render the checklist tab.

        Shows guidance messages until both a processed data room and a
        selected checklist are available, then offers generation/display.
        """
        documents = self.session.documents
        if not documents:
            status_message("👈 Configure and process data room first", "info")
            return

        # Use checklist from sidebar
        file_text = self.session.checklist_text

        if not file_text:
            status_message("👈 Select a checklist in the sidebar first", "info")
            return

        # Generate button row
        button_clicked = render_generate_buttons(
            "📊 Generate Matching",
            "regenerate_checklist_btn",
            "checklist_results",
            "Generate checklist matching analysis",
            self.session
        )

        # Generate or display content (cached results win over regeneration).
        if button_clicked and not self.session.checklist_results:
            self._generate_checklist_matching()
        elif self.session.checklist_results:
            # Lazy import to avoid a circular import with the UI components module.
            from app.ui.ui_components import render_checklist_results
            results = self.session.checklist_results
            render_checklist_results(results, relevancy_threshold=self.config.processing['similarity_threshold'])
        else:
            status_message("👆 Click 'Generate Matching' to analyze checklist items against documents", "info")

    @processing_guard()
    def _generate_checklist_matching(self):
        """Generate checklist matching analysis.

        Guarded by @processing_guard so concurrent runs are prevented and
        the processing flag is reset when this method exits.
        """
        # Initialize document processor with loaded FAISS store
        from app.core import create_document_processor

        # Get the store name from session (set during data room processing)
        store_name = self.session.vdr_store
        if not store_name:
            st.error("❌ No data room processed. Please process a data room first.")
            return

        document_processor = create_document_processor(store_name=store_name)

        try:
            checklist_text = self.session.checklist_text
            if not checklist_text or not self.session.chunks:
                st.error("❌ No checklist or document chunks available")
                return

            # Check if data room has been processed
            if not hasattr(self.session, 'documents') or not self.session.documents:
                st.error("❌ No data room processed. Please process a data room first before running checklist analysis.")
                return

            # Note: Document type embeddings will be auto-loaded if missing during processing

            with st.spinner("Processing checklist, please wait..."):
                from app.core.parsers import parse_checklist
                from app.core import search_and_analyze

                try:
                    # Parse raw checklist text into structured items via the LLM.
                    llm = self.ai_handler.llm
                    if not llm:
                        raise ValueError("AI service not configured. Please set up your API key first.")
                    checklist = parse_checklist(checklist_text, llm)
                    self.session.checklist = checklist

                    # Use pre-built FAISS index from document processor
                    if not document_processor.vector_store:
                        raise ValueError("No pre-built FAISS index loaded. Please ensure data room is processed first.")

                    vector_store = document_processor.vector_store

                    # Process checklist items; the LLM argument is None when no
                    # agent is configured (search_and_analyze degrades gracefully
                    # in that case — presumably similarity-only; verify in app.core).
                    checklist_results = search_and_analyze(
                        checklist,
                        vector_store,
                        self.ai_handler.session.agent.llm if self.ai_handler.is_agent_available() else None,
                        self.config.processing['similarity_threshold'],
                        'items',
                        store_name=getattr(document_processor, 'store_name', None),
                        session=self.session
                    )
                    self.session.checklist_results = checklist_results

                    status_message("✅ Checklist matching analysis completed!", "success")
                    st.rerun()

                except Exception as e:
                    logger.error(f"Checklist processing failed: {e}")
                    display_generation_error("checklist analysis", e)

        except Exception as e:
            logger.error(f"Failed to initialize document processor: {e}")
            display_initialization_error("document processor", e)
        finally:
            # Processing state is managed by processing_guard decorator
            pass
135
+ pass
136
+
app/ui/tabs/graph_tab.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Knowledge Graph Tab
4
+
5
+ This tab provides an interface for exploring pre-computed knowledge graphs
6
+ generated from due diligence documents. It offers entity search, relationship
7
+ exploration, and graph analysis capabilities.
8
+ """
9
+
10
+ import streamlit as st
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ import pandas as pd
14
+ from typing import Dict, List, Any, Optional
15
+
16
+ from app.core.knowledge_graph import KnowledgeGraphManager, get_available_knowledge_graphs
17
+ from app.ui.tabs.tab_base import TabBase
18
+ from app.ui.error_handler import handle_ui_errors
19
+ from app.core.logging import logger
20
+
21
+ class GraphTab(TabBase):
22
+ """Knowledge Graph exploration tab"""
23
+
24
    def __init__(self, session_manager, config, ai_handler, export_handler):
        """Initialize the tab; tab_name/tab_key identify it in the tab registry."""
        super().__init__(session_manager, config, ai_handler, export_handler)
        self.tab_name = "Knowledge Graph"
        self.tab_key = "graph"
28
+
29
    @handle_ui_errors("Knowledge Graph", "Please try refreshing the page")
    def render(self):
        """Render the knowledge graph tab.

        Requires a loaded company (``session.vdr_store``); lazily loads a
        per-company KnowledgeGraphManager cached in st.session_state, then
        renders the summary plus five exploration sub-tabs.
        """
        st.header("🧠 Knowledge Graph Explorer")

        # Check if we have a loaded company
        if not self.session.vdr_store:
            st.info("📋 Please load a company first using the sidebar.")
            return

        company_name = self.session.vdr_store

        # Initialize knowledge graph manager (cached per company so switching
        # companies does not discard an already-loaded graph).
        if f'kg_manager_{company_name}' not in st.session_state:
            st.session_state[f'kg_manager_{company_name}'] = KnowledgeGraphManager(company_name)

        kg_manager = st.session_state[f'kg_manager_{company_name}']

        # Load graph if not already loaded; graphs are pre-computed offline.
        if not kg_manager.is_available():
            with st.spinner("Loading knowledge graph..."):
                if not kg_manager.load_graph():
                    st.error("❌ Knowledge graph not found for this company.")
                    st.info("💡 Run `python scripts/build_knowledge_graphs.py` to generate knowledge graphs.")
                    return

        # Display graph summary
        self._render_graph_summary(kg_manager)

        # Main interface tabs
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "🔍 Entity Search",
            "🔗 Relationship Explorer",
            "📊 Graph Analysis",
            "🎯 Path Finder",
            "🧠 Semantic Search"
        ])

        with tab1:
            self._render_entity_search(kg_manager)

        with tab2:
            self._render_relationship_explorer(kg_manager)

        with tab3:
            self._render_graph_analysis(kg_manager)

        with tab4:
            self._render_path_finder(kg_manager)

        with tab5:
            self._render_semantic_search(kg_manager)
81
+
82
    def _render_graph_summary(self, kg_manager: KnowledgeGraphManager):
        """Render graph summary statistics.

        Shows entity/relationship counts as metrics and, when present, a pie
        chart of the entity-type distribution. Silently renders nothing when
        the manager returns empty stats.
        """
        stats = kg_manager.get_summary_stats()

        if not stats:
            return

        # Summary metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Entities", stats.get('num_entities', 0))

        with col2:
            st.metric("Relationships", stats.get('num_relationships', 0))

        with col3:
            entity_types = stats.get('entity_types', {})
            st.metric("Entity Types", len(entity_types))

        with col4:
            rel_types = stats.get('relationship_types', {})
            st.metric("Relationship Types", len(rel_types))

        # Entity distribution chart (entity_types is still in scope here —
        # `with col3:` does not create a new Python scope).
        if entity_types:
            with st.expander("📊 Entity Distribution", expanded=False):
                fig = px.pie(
                    values=list(entity_types.values()),
                    names=list(entity_types.keys()),
                    title="Distribution of Entity Types"
                )
                st.plotly_chart(fig, width='stretch')
115
+
116
    def _render_entity_search(self, kg_manager: KnowledgeGraphManager):
        """Render entity search interface.

        Free-text search over graph entities with an optional type filter;
        each hit expands to details, context samples, and a button that hands
        the entity off to the relationship explorer via session state.
        """
        st.subheader("🔍 Search Entities")

        # Search controls
        col1, col2 = st.columns([3, 1])

        with col1:
            search_query = st.text_input(
                "Search for entities (companies, people, contracts, etc.)",
                placeholder="e.g., Microsoft, John Smith, acquisition...",
                key="entity_search_query"
            )

        with col2:
            entity_types = ['All'] + list(kg_manager.get_summary_stats().get('entity_types', {}).keys())
            selected_type = st.selectbox(
                "Filter by type",
                entity_types,
                key="entity_type_filter"
            )

        if search_query:
            # Perform search; 'All' means no type filter.
            filter_type = None if selected_type == 'All' else selected_type
            results = kg_manager.search_entities(
                search_query,
                entity_type=filter_type,
                limit=20
            )

            if results:
                st.success(f"Found {len(results)} matching entities")

                # Display results; only the top hit is expanded by default.
                for i, entity in enumerate(results):
                    with st.expander(f"🏷️ {entity['name']} ({entity['type']})", expanded=i==0):
                        col1, col2 = st.columns([2, 1])

                        with col1:
                            st.write(f"**Type:** {entity['type']}")
                            st.write(f"**Sources:** {entity['sources']}")
                            st.write(f"**Document Type:** {entity['document_type']}")

                            # Show context samples
                            if entity.get('context_samples'):
                                st.write("**Context:**")
                                for context in entity['context_samples']:
                                    if context.strip():
                                        st.write(f"_{context.strip()}_")

                        with col2:
                            st.metric("Relevance Score", f"{entity['score']:.2f}")

                            # Button to explore relationships — stores the pick in
                            # session state and reruns so the explorer tab sees it.
                            if st.button(f"Explore Relationships", key=f"explore_{i}"):
                                st.session_state['selected_entity'] = entity['name']
                                st.rerun()
            else:
                st.info("No entities found matching your search criteria.")
176
+
177
    def _render_relationship_explorer(self, kg_manager: KnowledgeGraphManager) -> None:
        """Render relationship exploration interface.

        Lets the user type an entity name (pre-filled from
        ``st.session_state['selected_entity']`` when another section's
        "Explore Relationships" button selected one), then shows that
        entity's outgoing and incoming relationships as tables, optional
        per-relationship context text, and a bar chart of relationship-type
        counts.

        Args:
            kg_manager: Knowledge-graph backend used to look up relationships.
        """
        st.subheader("🔗 Relationship Explorer")

        # Entity selection — seeded by the "Explore Relationships" buttons
        # elsewhere in this tab, which set 'selected_entity' then rerun.
        selected_entity = st.session_state.get('selected_entity', '')
        entity_input = st.text_input(
            "Enter entity name to explore relationships",
            value=selected_entity,
            placeholder="e.g., Microsoft, John Smith...",
            key="relationship_entity_input"
        )

        if entity_input:
            # Get relationships. Assumed shape: dict with 'outgoing' and
            # 'incoming' lists of relationship records — TODO confirm against
            # KnowledgeGraphManager.get_entity_relationships.
            relationships = kg_manager.get_entity_relationships(entity_input)

            if relationships['outgoing'] or relationships['incoming']:
                # Display outgoing relationships
                if relationships['outgoing']:
                    st.write("### ➡️ Outgoing Relationships")
                    outgoing_data = []
                    for rel in relationships['outgoing']:
                        outgoing_data.append({
                            'Target': rel['target'],
                            'Type': rel['target_type'],
                            'Relationship': rel['relationship'],
                            'Source Doc': rel['source_document'],
                            'Confidence': f"{rel['confidence']:.2f}"
                        })

                    df_out = pd.DataFrame(outgoing_data)
                    st.dataframe(df_out, width='stretch')

                    # Show relationship context on selection
                    if st.checkbox("Show relationship contexts", key="show_outgoing_context"):
                        for i, rel in enumerate(relationships['outgoing']):
                            if rel['context'].strip():
                                st.write(f"**{rel['target']} ({rel['relationship']}):**")
                                st.write(f"_{rel['context']}_")
                                st.write("---")

                # Display incoming relationships
                if relationships['incoming']:
                    st.write("### ⬅️ Incoming Relationships")
                    incoming_data = []
                    for rel in relationships['incoming']:
                        incoming_data.append({
                            'Source': rel['source'],
                            'Type': rel['source_type'],
                            'Relationship': rel['relationship'],
                            'Source Doc': rel['source_document'],
                            'Confidence': f"{rel['confidence']:.2f}"
                        })

                    df_in = pd.DataFrame(incoming_data)
                    st.dataframe(df_in, width='stretch')

                    # Show relationship context on selection
                    if st.checkbox("Show relationship contexts", key="show_incoming_context"):
                        for i, rel in enumerate(relationships['incoming']):
                            if rel['context'].strip():
                                st.write(f"**{rel['source']} ({rel['relationship']}):**")
                                st.write(f"_{rel['context']}_")
                                st.write("---")

                # Relationship type distribution across both directions.
                all_rels = relationships['outgoing'] + relationships['incoming']
                rel_types = {}
                for rel in all_rels:
                    rel_type = rel['relationship']
                    rel_types[rel_type] = rel_types.get(rel_type, 0) + 1

                if rel_types:
                    st.write("### 📊 Relationship Type Distribution")
                    fig = px.bar(
                        x=list(rel_types.keys()),
                        y=list(rel_types.values()),
                        title=f"Relationships for {entity_input}"
                    )
                    st.plotly_chart(fig, width='stretch')

            else:
                st.info(f"No relationships found for '{entity_input}'. Try a different entity name.")
262
+ def _render_graph_analysis(self, kg_manager: KnowledgeGraphManager):
263
+ """Render graph analysis interface"""
264
+ st.subheader("📊 Graph Analysis")
265
+
266
+ # Central entities
267
+ st.write("### 🎯 Most Important Entities")
268
+ central_entities = kg_manager.get_central_entities(limit=15)
269
+
270
+ if central_entities:
271
+ # Create a bar chart of centrality scores
272
+ names = [e['name'] for e in central_entities]
273
+ scores = [e['centrality_score'] for e in central_entities]
274
+ types = [e['type'] for e in central_entities]
275
+
276
+ fig = px.bar(
277
+ x=scores,
278
+ y=names,
279
+ orientation='h',
280
+ color=types,
281
+ title="Entity Centrality Scores",
282
+ labels={'x': 'Centrality Score', 'y': 'Entity'}
283
+ )
284
+ fig.update_layout(height=500)
285
+ st.plotly_chart(fig, width='stretch')
286
+
287
+ # Display detailed table
288
+ with st.expander("📋 Detailed Central Entities", expanded=False):
289
+ central_df = pd.DataFrame([{
290
+ 'Entity': e['name'],
291
+ 'Type': e['type'],
292
+ 'Centrality Score': e['centrality_score'],
293
+ 'Connections': e['num_connections'],
294
+ 'Sources': e['sources']
295
+ } for e in central_entities])
296
+ st.dataframe(central_df, width='stretch')
297
+
298
+ # Entity clusters
299
+ st.write("### 🎭 Entity Clusters")
300
+ clusters = kg_manager.get_entity_clusters()
301
+
302
+ if clusters:
303
+ st.info(f"Found {len(clusters)} clusters of related entities")
304
+
305
+ for i, cluster in enumerate(clusters):
306
+ with st.expander(f"Cluster {i+1} ({len(cluster)} entities)", expanded=i==0):
307
+ # Display cluster as tags
308
+ cluster_html = " • ".join([f"**{entity}**" for entity in cluster])
309
+ st.write(cluster_html)
310
+ else:
311
+ st.info("No significant entity clusters found.")
312
+
313
+ def _render_path_finder(self, kg_manager: KnowledgeGraphManager):
314
+ """Render path finding interface"""
315
+ st.subheader("🎯 Path Finder")
316
+ st.write("Find connections between two entities in the knowledge graph.")
317
+
318
+ col1, col2 = st.columns(2)
319
+
320
+ with col1:
321
+ source_entity = st.text_input(
322
+ "Source Entity",
323
+ placeholder="e.g., Microsoft",
324
+ key="path_source_entity"
325
+ )
326
+
327
+ with col2:
328
+ target_entity = st.text_input(
329
+ "Target Entity",
330
+ placeholder="e.g., OpenAI",
331
+ key="path_target_entity"
332
+ )
333
+
334
+ max_length = st.slider("Maximum Path Length", 1, 5, 3, key="max_path_length")
335
+
336
+ if source_entity and target_entity and st.button("Find Paths", key="find_paths_btn"):
337
+ with st.spinner("Searching for paths..."):
338
+ paths = kg_manager.find_paths(source_entity, target_entity, max_length)
339
+
340
+ if paths:
341
+ st.success(f"Found {len(paths)} path(s) between {source_entity} and {target_entity}")
342
+
343
+ for i, path in enumerate(paths):
344
+ st.write(f"**Path {i+1}:**")
345
+ path_str = " → ".join(path)
346
+ st.write(f"🔗 {path_str}")
347
+
348
+ # Show path length
349
+ st.write(f"_Length: {len(path)-1} steps_")
350
+ st.write("---")
351
+ else:
352
+ st.info(f"No paths found between {source_entity} and {target_entity} within {max_length} steps.")
353
+
354
+ # Path finding tips
355
+ with st.expander("💡 Path Finding Tips", expanded=False):
356
+ st.write("""
357
+ - **Entity names**: Use exact or partial entity names as they appear in the documents
358
+ - **Path length**: Shorter paths show direct connections, longer paths reveal indirect relationships
359
+ - **Multiple paths**: Different paths can reveal different types of business relationships
360
+ - **Use cases**:
361
+ - Find how two companies are connected
362
+ - Trace investment or acquisition chains
363
+ - Discover business partnerships and alliances
364
+ """)
365
+
366
    def _render_semantic_search(self, kg_manager: KnowledgeGraphManager) -> None:
        """Render semantic search interface using FAISS embeddings.

        Three sub-sections, each driven by its own input + button:
          1. Semantic entity search — free-text query matched against entity
             embeddings, with tunable result limit and similarity threshold.
          2. Find related by context — entities appearing in contexts similar
             to a reference entity's contexts.
          3. Semantic path discovery — connection paths ranked by relevance
             to a natural-language description.

        Args:
            kg_manager: Knowledge-graph backend providing the semantic
                primitives used here (``semantic_search_entities``,
                ``find_related_entities_by_context``, ``semantic_path_search``).
        """
        st.subheader("🧠 Semantic Search")
        st.write("Search entities using natural language queries powered by your existing FAISS embeddings.")

        # --- Section 1: semantic entity search ---
        st.write("### 🔍 Semantic Entity Search")
        semantic_query = st.text_input(
            "Describe what you're looking for (e.g., 'technology companies', 'financial partnerships', 'recent acquisitions')",
            placeholder="e.g., companies involved in AI partnerships",
            key="semantic_entity_query"
        )

        col1, col2 = st.columns([1, 1])
        with col1:
            semantic_limit = st.slider("Max results", 5, 20, 10, key="semantic_limit")
        with col2:
            similarity_threshold = st.slider("Similarity threshold", 0.1, 0.8, 0.3, key="similarity_threshold")

        if semantic_query and st.button("🔍 Semantic Search", key="semantic_search_btn"):
            with st.spinner("Searching using AI embeddings..."):
                results = kg_manager.semantic_search_entities(
                    semantic_query,
                    limit=semantic_limit,
                    similarity_threshold=similarity_threshold
                )

                if results:
                    st.success(f"Found {len(results)} semantically relevant entities")

                    # First result starts expanded; the rest collapsed.
                    for i, entity in enumerate(results):
                        with st.expander(f"🏷️ {entity['name']} ({entity['type']}) - Score: {entity['similarity_score']:.3f}", expanded=i==0):
                            col1, col2 = st.columns([2, 1])

                            with col1:
                                st.write(f"**Type:** {entity['type']}")
                                st.write(f"**Sources:** {entity['sources']}")
                                st.write(f"**Document Type:** {entity['document_type']}")

                                # Show matching context (the chunk that matched the query)
                                if entity.get('matching_context'):
                                    st.write("**Relevant Context:**")
                                    st.write(f"_{entity['matching_context']}_")

                                # Show original context samples
                                if entity.get('context_samples'):
                                    st.write("**Entity Context:**")
                                    for context in entity['context_samples']:
                                        if context.strip():
                                            st.write(f"_{context.strip()}_")

                            with col2:
                                st.metric("Similarity Score", f"{entity['similarity_score']:.3f}")

                                # Button to explore relationships: hand the entity off to
                                # the Relationship Explorer section via session state.
                                if st.button(f"Explore Relations", key=f"semantic_explore_{i}"):
                                    st.session_state['selected_entity'] = entity['name']
                                    st.rerun()
                else:
                    st.info("No entities found matching your semantic query. Try adjusting the similarity threshold or rephrasing your query.")

        # --- Section 2: context-based related entities ---
        st.write("### 🔗 Find Related by Context")
        st.write("Find entities that appear in similar contexts to a reference entity.")

        context_entity = st.text_input(
            "Reference entity name",
            placeholder="e.g., Microsoft",
            key="context_reference_entity"
        )

        context_limit = st.slider("Max related entities", 3, 15, 5, key="context_limit")

        if context_entity and st.button("Find Related by Context", key="find_context_related_btn"):
            with st.spinner("Finding contextually related entities..."):
                related = kg_manager.find_related_entities_by_context(context_entity, limit=context_limit)

                if related:
                    st.success(f"Found {len(related)} contextually related entities")

                    related_data = []
                    for entity in related:
                        related_data.append({
                            'Entity': entity['name'],
                            'Type': entity['type'],
                            'Similarity': f"{entity['similarity_score']:.3f}",
                            'Reason': entity['relationship_reason'],
                            'Sources': entity['sources']
                        })

                    df_related = pd.DataFrame(related_data)
                    st.dataframe(df_related, width='stretch')

                    # Show context samples for selected entities
                    if st.checkbox("Show context samples", key="show_related_contexts"):
                        for entity in related:
                            if entity.get('context_samples'):
                                st.write(f"**{entity['name']}:**")
                                for context in entity['context_samples']:
                                    if context.strip():
                                        st.write(f"_{context.strip()}_")
                                st.write("---")
                else:
                    st.info(f"No contextually related entities found for '{context_entity}'.")

        # --- Section 3: semantic path search ---
        st.write("### 🎯 Semantic Path Discovery")
        st.write("Find connection paths that are semantically relevant to your query.")

        path_query = st.text_input(
            "Describe the type of connections you want to find",
            placeholder="e.g., investment relationships, technology partnerships",
            key="semantic_path_query"
        )

        max_semantic_paths = st.slider("Max paths", 3, 10, 5, key="max_semantic_paths")

        if path_query and st.button("Find Semantic Paths", key="semantic_paths_btn"):
            with st.spinner("Discovering relevant connection paths..."):
                paths = kg_manager.semantic_path_search(path_query, max_paths=max_semantic_paths)

                if paths:
                    st.success(f"Found {len(paths)} relevant connection paths")

                    for i, path_info in enumerate(paths):
                        st.write(f"**Path {i+1}:** (Relevance: {path_info['relevance_score']:.3f})")
                        path_str = " → ".join(path_info['path'])
                        st.write(f"🔗 {path_str}")
                        st.write(f"_{path_info['query_relevance']}_")
                        st.write(f"Length: {path_info['path_length']} steps")
                        st.write("---")
                else:
                    st.info(f"No semantically relevant paths found for '{path_query}'.")

        # Semantic search tips
        with st.expander("💡 Semantic Search Tips", expanded=False):
            st.write("""
            **Semantic Search Benefits:**
            - Uses your existing FAISS embeddings for intelligent matching
            - Finds entities based on meaning, not just keywords
            - Discovers hidden relationships through context similarity
            - Leverages the same AI models used in your document analysis

            **Query Examples:**
            - "technology companies with AI focus"
            - "recent merger and acquisition activity"
            - "financial services partnerships"
            - "regulatory compliance issues"
            - "key executive leadership"

            **How it works:**
            1. Your query is embedded using the same model as your documents
            2. FAISS finds the most similar document chunks
            3. Entities from those chunks are returned with similarity scores
            4. Results are ranked by semantic relevance

            **Performance Notes:**
            - Requires existing FAISS indices (same as your document search)
            - No additional models or external services needed
            - Leverages your pre-computed embeddings for fast results
            """)
528
+ def get_status(self) -> Dict[str, Any]:
529
+ """Get current status of the knowledge graph tab"""
530
+ if not self.session.vdr_store:
531
+ return {
532
+ 'ready': False,
533
+ 'message': 'No company loaded'
534
+ }
535
+
536
+ company_name = self.session.vdr_store
537
+ available_graphs = get_available_knowledge_graphs()
538
+
539
+ if company_name not in available_graphs:
540
+ return {
541
+ 'ready': False,
542
+ 'message': f'Knowledge graph not available for {company_name}'
543
+ }
544
+
545
+ return {
546
+ 'ready': True,
547
+ 'message': f'Knowledge graph ready for {company_name}'
548
+ }
app/ui/tabs/overview_tab.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Overview Tab Component
4
+
5
+ Handles company overview generation and display.
6
+ """
7
+
8
+ # Standard library imports
9
+ from pathlib import Path
10
+
11
+ # Third-party imports
12
+ import streamlit as st
13
+
14
+ # Local imports
15
+ from app.ui.tabs.tab_base import TabBase
16
+ from app.ui.ui_components import status_message
17
+
18
+
19
class OverviewTab(TabBase):
    """
    Company overview tab that handles overview generation and display.

    Generation follows the same processing-guard pattern as StrategicTab so a
    failing AI call cannot leave the tab stuck or crash the page.
    """

    def render(self):
        """Render the overview tab"""
        if not self._check_documents_available():
            return

        # Generate button row
        button_clicked = self._render_generate_buttons(
            "🤖 Generate Overview",
            "regenerate_overview_btn",
            "overview_summary",
            "Use AI to generate company overview analysis"
        )

        # Generate or display content
        if self._should_generate_content(button_clicked, "overview_summary"):
            self._generate_report("overview", "overview_summary", "✅ Company overview generated successfully!")
        else:
            self._render_content_or_placeholder(
                "overview_summary",
                "👆 Click 'Generate Overview' to create AI-powered company analysis"
            )

    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Generate company overview report using AI.

        Consistency/robustness fix: mirrors StrategicTab's processing guard and
        error handling, which this method previously lacked. ``st.rerun()`` is
        deliberately called OUTSIDE the try/except — Streamlit implements rerun
        via a control-flow exception derived from Exception, which a broad
        except would otherwise swallow.

        Args:
            report_type: Report identifier passed to the AI handler (e.g. "overview").
            session_attr: Session attribute name the generated text is stored under.
            success_message: Message shown when generation succeeds.
        """
        if not self._check_ai_availability():
            return

        if not self._check_processing_active():
            return

        # Mark processing active so concurrent generation clicks are rejected.
        self._set_processing_active(True)

        overview_summary = None
        failure = None
        try:
            with st.spinner("Agent running, please wait..."):
                data_room_name = self._get_data_room_name()

                overview_summary = self.ai_handler.generate_report(
                    report_type,
                    documents=self.session.documents,
                    data_room_name=data_room_name,
                    strategy_text=self.session.strategy_text,
                    checklist_results=self.session.checklist_results
                )
        except Exception as e:
            failure = e
        finally:
            # Always reset processing state, even on error.
            self._set_processing_active(False)

        if failure is not None:
            status_message(f"Failed to generate overview: {str(failure)}", "error")
        elif overview_summary:
            setattr(self.session, session_attr, overview_summary)
            status_message(success_message, "success")
            st.rerun()
        else:
            status_message("Failed to generate overview. Please try again.", "error")

    def _get_export_method_name(self) -> str:
        """Get export method name for overview reports"""
        return "export_overview_report"

    def _get_download_key(self) -> str:
        """Get download button key for overview reports"""
        return "export_overview_btn"
app/ui/tabs/qa_tab.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Q&A Tab Component
4
+
5
+ Handles Q&A with citations functionality.
6
+ """
7
+
8
+ # Standard library imports
9
+ from pathlib import Path
10
+
11
+ # Third-party imports
12
+ import streamlit as st
13
+
14
+ # Local imports
15
+ from app.core import RELEVANCY_THRESHOLD, logger
16
+ from app.handlers.ai_handler import AIHandler
17
+ from app.ui.session_manager import SessionManager
18
+ from app.ui.ui_components import (
19
+ display_processing_error,
20
+ display_generation_error,
21
+ display_download_error,
22
+ status_message
23
+ )
24
+
25
+
26
class QATab:
    """
    Q&A with citations tab that handles question answering and citation display.

    Search results are cached in ``st.session_state`` under a per-question key
    so that widget-triggered reruns (e.g. download-button clicks) do not
    repeat the document search.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler"""
        self.session = session
        self.config = config
        self.ai_handler = ai_handler

    def render(self):
        """Render the Q&A tab"""
        chunks = self.session.chunks
        if not chunks:
            status_message("👈 Process data room first to enable Q&A", "info")
            return

        # Question input
        question = st.text_input(
            "Ask a question about your documents:",
            placeholder="e.g., What are the main risks? What is the revenue model? Who are the key customers?",
            key="qa_question_input"
        )

        # Handle Q&A query if there's a question
        if question:
            st.divider()
            self._handle_qa_query(question)

    def _handle_qa_query(self, question: str):
        """Search documents for the question (once) and display the results.

        The search runs only when no cached results exist for this question;
        rendering always happens from the cached session-state entry.
        """
        # Unique per-question cache key to prevent reruns from resetting results.
        qa_key = f"qa_results_{hash(question) % 100000}"

        if qa_key not in st.session_state:
            try:
                from app.core import search_documents, create_document_processor

                # Get the store name from session (set during data room processing)
                store_name = self.session.vdr_store
                if not store_name:
                    st.error("❌ No data room processed. Please process a data room first.")
                    return

                # Initialize document processor with loaded FAISS store
                document_processor = create_document_processor(store_name=store_name)

                # Use lower threshold for Q&A to get more relevant results
                qa_threshold = 0.15

                with st.spinner("🔍 Searching documents..."):
                    results = search_documents(
                        question,
                        document_processor,
                        top_k=self.config.ui['top_k_search_results'],
                        threshold=qa_threshold
                    )

                    # Fallback: retry with a very low threshold if nothing matched.
                    if not results:
                        logger.info(f"No results found with threshold {qa_threshold}, trying lower threshold...")
                        fallback_threshold = 0.05
                        results = search_documents(
                            question,
                            document_processor,
                            top_k=self.config.ui['top_k_search_results'],
                            threshold=fallback_threshold
                        )
                        if results:
                            st.info(f"ℹ️ Found results with lower relevance threshold ({fallback_threshold})")

                # Store results in session state to prevent resets
                st.session_state[qa_key] = {
                    'question': question,
                    'results': results,
                    'has_ai': self.ai_handler.is_agent_available()
                }

            except Exception as e:
                logger.error(f"Failed to handle Q&A query: {e}")
                display_processing_error("question", e)
                return

        # Render results from session state
        qa_data = st.session_state[qa_key]
        results = qa_data['results']

        if results:
            # Use agent to synthesize an answer when available; otherwise show raw hits.
            if qa_data['has_ai']:
                self._render_ai_answer(question, results)
            else:
                self._render_direct_results(results)
        else:
            status_message("No relevant information found for your question.", "warning")

    def _render_ai_answer(self, question: str, results: list):
        """Render AI-generated answer with citations"""
        st.markdown("### 🤖 AI Service Answer")
        with st.spinner("AI processing, please wait..."):
            try:
                # Convert the top hits to plain-text context for the AI handler.
                context_docs = [f"From {r.get('source', 'Unknown')}:\n{r.get('text', '')}" for r in results[:3]]

                answer_text = self.ai_handler.answer_question(question, context_docs)
                st.markdown(answer_text)

            except Exception as e:
                logger.error(f"Failed to generate AI answer: {e}")
                display_generation_error("AI answer")

        st.divider()
        self._render_source_documents(results, question)

    def _render_direct_results(self, results: list):
        """Render direct search results without AI synthesis"""
        st.markdown("### 📚 Relevant Documents")
        self._render_source_documents(results)

    def _render_source_documents(self, results: list, question: str = ""):
        """Render up to three source documents with excerpt, citation, and download."""
        st.markdown("### 📚 Source Documents")

        for i, result in enumerate(results[:3], 1):
            with st.container():
                col1, col2 = st.columns([5, 1])
                with col1:
                    text_content = result.get('text', '')
                    excerpt = text_content[:200] + "..." if len(text_content) > 200 else text_content
                    # Bug fix: previous version rendered a stray ')' after the
                    # closing quote of the excerpt.
                    st.markdown(f'{i}. "{excerpt}"')

                    # Show document info and citation
                    doc_source = result.get('source', 'Unknown')
                    citation = result.get('citation', '')
                    st.caption(f"   📄 {doc_source} ({citation})" if citation else f"   📄 {doc_source}")

                with col2:
                    # Only show one download button
                    self._render_qa_download_button(result, i, question)

    def _format_document_title(self, doc_name: str) -> str:
        """Format document title for display; fall back to the raw name on failure."""
        try:
            from app.core import format_document_title
            return format_document_title(doc_name)
        except Exception:
            return doc_name

    def _render_qa_download_button(self, result: dict, idx: int, question: str):
        """Render a download button for one Q&A result, if its file is resolvable."""
        doc_path = result.get('path', '')
        if not doc_path:
            return

        # Stable key (question + path hash) so reruns don't recreate the widget.
        doc_source = result.get('source', 'document')
        button_key = f"qa_dl_{idx}_{hash(doc_path + question) % 100000}"

        # Use the shared path resolution logic for consistency with other tabs.
        try:
            from app.ui.ui_components import _resolve_document_path
            resolved_path = _resolve_document_path(doc_path)

            if resolved_path and resolved_path.exists():
                file_bytes = resolved_path.read_bytes()

                st.download_button(
                    label="📥 Download",
                    data=file_bytes,
                    file_name=resolved_path.name,  # Use actual filename
                    mime="application/pdf",
                    key=button_key,
                    help=f"Download {doc_source}",
                    width='stretch'
                )
            else:
                st.caption("(unavailable)")
        except Exception as e:
            logger.error(f"Download failed: {str(e)}")
            st.caption("(error)")
app/ui/tabs/questions_tab.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Questions Tab Component
4
+
5
+ Handles due diligence questions analysis and display.
6
+ """
7
+
8
+ import streamlit as st
9
+
10
+ from app.ui.session_manager import SessionManager
11
+ from app.ui.ui_components import (
12
+ status_message,
13
+ render_generate_buttons,
14
+ processing_guard,
15
+ display_generation_error,
16
+ display_initialization_error
17
+ )
18
+ from app.handlers.ai_handler import AIHandler
19
+ from app.core.logging import logger
20
+
21
+
22
class QuestionsTab:
    """
    Questions tab that handles due diligence questions analysis and display.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler):
        """Initialize tab with session manager, config, and AI handler"""
        self.session = session
        self.config = config
        self.ai_handler = ai_handler

    def render(self):
        """Render the questions tab"""
        documents = self.session.documents
        if not documents:
            status_message("👈 Configure and process data room first", "info")
            return

        # Use questions from sidebar
        file_text = self.session.questions_text
        if not file_text:
            status_message("👈 Select a questions list in the sidebar first", "info")
            return

        # Generate button row
        button_clicked = render_generate_buttons(
            "❓ Generate Answers",
            "regenerate_questions_btn",
            "question_answers",
            "Generate answers for due diligence questions",
            self.session
        )

        # Generate or display content
        if button_clicked and not self.session.question_answers:
            self._generate_question_answers()
        elif self.session.question_answers:
            self._display_answers()
        else:
            status_message("👆 Click 'Generate Answers' to find relevant documents for due diligence questions", "info")

    def _display_answers(self):
        """Normalize stored answers into {question_id: data} form and render them."""
        from app.ui.ui_components import render_question_results
        answers = self.session.question_answers
        # Convert from {'questions': [...]} format to {question_id: answer_data} format
        if isinstance(answers, dict) and 'questions' in answers:
            questions_dict = {
                f"question_{i}": question_data
                for i, question_data in enumerate(answers['questions'])
            }
            render_question_results(questions_dict)
        else:
            render_question_results(answers)

    @processing_guard()
    def _generate_question_answers(self):
        """Generate question answering analysis.

        Parses the sidebar questions with the configured LLM, reuses the
        pre-built FAISS index for the processed data room, and stores the
        batch results in ``self.session.question_answers``.

        Fixes vs previous version:
        - removed unused ``DocumentProcessor`` import;
        - processor creation is now inside the try whose handler reports
          "document processor" initialization failures (previously that
          handler could never fire for creation errors);
        - ``st.rerun()`` moved outside the except block so Streamlit's
          rerun control-flow exception is not swallowed as a failure.
        """
        from app.core.utils import create_document_processor

        # Get the store name from session (set during data room processing)
        store_name = self.session.vdr_store
        if not store_name:
            st.error("❌ No data room processed. Please process a data room first.")
            return

        try:
            document_processor = create_document_processor(store_name=store_name)
        except Exception as e:
            logger.error(f"Failed to initialize document processor: {e}")
            display_initialization_error("document processor", e)
            return

        questions_text = self.session.questions_text
        if not questions_text or not self.session.chunks:
            st.error("❌ No questions or document chunks available")
            return

        completed = False
        # Show progress indicator
        with st.spinner("🚀 Starting question analysis..."):
            try:
                from app.core.parsers import parse_questions
                from app.core.search import search_and_analyze

                # Step 1: Parse questions
                st.info("📋 Parsing questions...")
                llm = self.ai_handler.llm
                if not llm:
                    raise ValueError("AI service not configured. Please set up your API key first.")
                questions = parse_questions(questions_text, llm)
                self.session.questions = questions
                st.info(f"Found {len(questions)} questions to process")

                # Step 2: Use pre-built FAISS index
                st.info("🔍 Setting up document search...")
                if not document_processor.vector_store:
                    raise ValueError("No pre-built FAISS index loaded. Please ensure data room is processed first.")
                vector_store = document_processor.vector_store

                # Step 3: Process questions with batch processing
                st.info("🤖 Processing questions with AI (batch mode)...")
                st.info("Using concurrent processing for faster results...")

                # NOTE(review): passes the agent's LLM (or None) rather than the
                # `llm` validated above — confirm the two are interchangeable.
                question_answers = search_and_analyze(
                    questions,
                    vector_store,
                    self.ai_handler.session.agent.llm if self.ai_handler.is_agent_available() else None,
                    self.config.processing['relevancy_threshold'],
                    'questions',
                    store_name=getattr(document_processor, 'store_name', None)
                )
                self.session.question_answers = question_answers

                # Complete
                questions_list = question_answers.get('questions', [])
                answered_count = sum(1 for a in questions_list if a.get('has_answer', False))
                st.success(f"✅ Completed! {answered_count}/{len(questions)} questions answered")

                status_message("✅ Question answering analysis completed!", "success")
                completed = True

            except Exception as e:
                logger.error(f"Questions processing failed: {e}")
                display_generation_error("question analysis", e)

        if completed:
            # Rerun outside the try/except so the rerun exception is not swallowed.
            st.rerun()
app/ui/tabs/strategic_tab.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Strategic Analysis Tab Component
4
+
5
+ Handles strategic analysis generation and display.
6
+ """
7
+
8
+ import streamlit as st
9
+
10
+ from app.ui.tabs.tab_base import TabBase
11
+ from app.ui.ui_components import status_message
12
+ from app.core import logger
13
+
14
+
15
class StrategicTab(TabBase):
    """
    Strategic analysis tab that handles strategic report generation and display.
    """

    def render(self):
        """Render the strategic analysis tab"""
        if not self._check_documents_available():
            return

        # Generate button row
        button_clicked = self._render_generate_buttons(
            "🎯 Generate Analysis",
            "regenerate_strategic_btn",
            "strategic_summary",
            "Use AI to generate strategic analysis"
        )

        # Generate or display content
        if self._should_generate_content(button_clicked, "strategic_summary"):
            self._generate_report("strategic", "strategic_summary", "✅ Strategic analysis generated successfully!")
        else:
            self._render_content_or_placeholder(
                "strategic_summary",
                "👆 Click 'Generate Analysis' to create AI-powered strategic assessment"
            )

    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Generate strategic analysis report using AI.

        Bug fix: ``st.rerun()`` is now invoked OUTSIDE the try/except block.
        Streamlit implements rerun via a control-flow exception derived from
        Exception, so the previous broad ``except Exception`` around it
        swallowed the rerun and reported a successful generation as a failure.

        Args:
            report_type: Report identifier passed to the AI handler (e.g. "strategic").
            session_attr: Session attribute name the generated text is stored under.
            success_message: Message shown when generation succeeds.
        """
        if not self._check_ai_availability():
            return

        if not self._check_processing_active():
            return

        # Set processing active so concurrent generation clicks are rejected.
        self._set_processing_active(True)

        strategic_summary = None
        failure = None
        try:
            with st.spinner("Agent running, please wait..."):
                data_room_name = self._get_data_room_name()

                strategic_summary = self.ai_handler.generate_report(
                    report_type,
                    documents=self.session.documents,
                    data_room_name=data_room_name,
                    strategy_text=self.session.strategy_text,
                    checklist_results=self.session.checklist_results
                )
        except Exception as e:
            failure = e
            logger.error(f"Failed to generate strategic analysis: {e}")
        finally:
            # Always reset processing state, even on error.
            self._set_processing_active(False)

        if failure is not None:
            status_message(f"Failed to generate strategic analysis: {str(failure)}", "error")
        elif strategic_summary:
            setattr(self.session, session_attr, strategic_summary)
            status_message(success_message, "success")
            st.rerun()
        else:
            status_message("Failed to generate strategic analysis. Please try again.", "error")

    def _get_export_method_name(self) -> str:
        """Get export method name for strategic reports"""
        return "export_strategic_report"

    def _get_download_key(self) -> str:
        """Get download button key for strategic reports"""
        return "export_strategic_btn"
+
app/ui/tabs/tab_base.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tab Base Component
4
+
5
+ Provides shared functionality for all tab components including common
6
+ initialization patterns, render methods, and export functionality.
7
+ """
8
+
9
+ # Standard library imports
10
+ from pathlib import Path
11
+ from typing import Optional, Any, Dict
12
+
13
+ # Third-party imports
14
+ import streamlit as st
15
+
16
+ # Local imports
17
+ from app.ui.error_handler import handle_ui_errors
18
+ from app.handlers.ai_handler import AIHandler
19
+ from app.handlers.export_handler import ExportHandler
20
+ from app.ui.session_manager import SessionManager
21
+ from app.ui.ui_components import status_message, render_generate_buttons
22
+
23
+
24
class TabBase:
    """Shared base for Streamlit tab components.

    Centralizes the patterns every tab repeats: dependency wiring, document
    availability checks, generate/regenerate button rendering, processing-state
    guards, and report export/download plumbing. Concrete tabs implement
    render() plus the report-specific hooks.
    """

    def __init__(self, session: SessionManager, config, ai_handler: AIHandler, export_handler: ExportHandler):
        """Wire in the session manager, app config, and the AI/export handlers."""
        self.session = session
        self.config = config
        self.ai_handler = ai_handler
        self.export_handler = export_handler

    def render(self):
        """Draw the tab UI. Must be overridden by each concrete tab."""
        raise NotImplementedError("Subclasses must implement render()")

    def _check_documents_available(self) -> bool:
        """Return True when processed documents exist; otherwise prompt the user."""
        if self.session.documents:
            return True
        status_message("👈 Configure and process data room first", "info")
        return False

    def _render_generate_buttons(self, generate_label: str, regenerate_key: str,
                                 session_attr: str, help_text: str) -> tuple[bool, bool]:
        """Delegate to the shared generate/regenerate button component."""
        return render_generate_buttons(generate_label, regenerate_key,
                                       session_attr, help_text, self.session)

    def _should_generate_content(self, generate_clicked: bool, session_attr: str) -> bool:
        """True when the user clicked generate and no content is cached yet."""
        cached = getattr(self.session, session_attr)
        return generate_clicked and not cached

    def _should_display_content(self, session_attr: str) -> bool:
        """True when the session already holds content for this attribute."""
        return bool(getattr(self.session, session_attr))

    def _get_data_room_name(self) -> str:
        """Derive the data room name from the first document's parent directory."""
        documents = self.session.documents
        if not documents:
            return "Unknown"
        first_doc = next(iter(documents))
        return Path(first_doc).parent.name

    def _check_ai_availability(self) -> bool:
        """Return True when the AI agent is configured; otherwise show an error."""
        if self.ai_handler.is_agent_available():
            return True
        status_message("AI Agent not available. Please configure your API key in the sidebar.", "error")
        return False

    def _check_processing_active(self) -> bool:
        """Return True when no other operation is running; otherwise warn."""
        if not self.session.processing_active:
            return True
        status_message("⚠️ Another operation is currently running. Please wait.", "warning")
        return False

    def _set_processing_active(self, active: bool):
        """Flip the session-wide processing flag."""
        self.session.processing_active = active

    @handle_ui_errors("Report generation", "Please check your documents and try again")
    def _generate_report(self, report_type: str, session_attr: str, success_message: str):
        """Run the AI report generation. Must be overridden by each concrete tab."""
        raise NotImplementedError("Subclasses must implement _generate_report()")

    def _render_export_button(self, export_method_name: str, download_key: str):
        """Offer a Markdown download of the report stored for this tab, if any."""
        # Map e.g. "export_strategic_report" -> session attribute "strategic_summary".
        session_attr = export_method_name.replace("export_", "").replace("_report", "_summary")
        if not getattr(self.session, session_attr):
            return

        # Resolve and invoke the matching ExportHandler method.
        file_name, export_data = getattr(self.export_handler, export_method_name)()
        if not (file_name and export_data):
            return

        st.download_button(
            "📥 Export Report",
            data=export_data,
            file_name=file_name,
            mime="text/markdown",
            key=download_key,
            help="Download report as Markdown file"
        )

    def _render_content_or_placeholder(self, session_attr: str, placeholder_message: str):
        """Show stored content (plus its export button) or an instructional placeholder."""
        content = getattr(self.session, session_attr)
        if not content:
            status_message(placeholder_message, "info")
            return

        if isinstance(content, str):
            st.markdown(content)
        else:
            # Non-string payloads are delegated to the subclass hook.
            self._render_custom_content(content)
        self._render_export_button(self._get_export_method_name(), self._get_download_key())

    def _render_custom_content(self, content: Any):
        """Hook for rendering non-string content; no-op unless overridden."""
        pass

    def _get_export_method_name(self) -> str:
        """Name of the ExportHandler method for this tab's report. Must be overridden."""
        raise NotImplementedError("Subclasses must implement _get_export_method_name()")

    def _get_download_key(self) -> str:
        """Unique Streamlit key for this tab's download button. Must be overridden."""
        raise NotImplementedError("Subclasses must implement _get_download_key()")