Spaces:
Sleeping
Sleeping
Initial commit for Hugging Face Space
Browse files- .dockerignore +57 -0
- .env.template +48 -0
- .gitignore +105 -0
- README.md +114 -11
- activate_venv.bat +18 -0
- activate_venv.ps1 +18 -0
- configuration/__init__.py +4 -0
- configuration/definitions.py +8 -0
- configuration/logger_setup.py +62 -0
- configuration/parameters.py +133 -0
- content_analyzer/__init__.py +3 -0
- content_analyzer/document_parser.py +842 -0
- content_analyzer/visual_detector.py +354 -0
- core/__init__.py +3 -0
- core/diagnostics.py +125 -0
- core/lifecycle.py +160 -0
- core/logger.py +16 -0
- dependencies.txt +52 -0
- intelligence/__init__.py +5 -0
- intelligence/accuracy_verifier.py +362 -0
- intelligence/context_validator.py +235 -0
- intelligence/knowledge_synthesizer.py +172 -0
- intelligence/orchestrator.py +388 -0
- main.py +986 -0
- maintenance.py +41 -0
- requirements.txt +53 -0
- search_engine/__init__.py +3 -0
- search_engine/indexer.py +228 -0
- test_token_size.py +78 -0
- tests/conftest.py +71 -0
- tests/test_accuracy_verifier.py +110 -0
- tests/test_context_validator.py +120 -0
- tests/test_knowledge_synthesizer.py +50 -0
- tests/test_visual_extraction.py +169 -0
- vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/data_level0.bin +3 -0
- vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/header.bin +3 -0
- vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/index_metadata.pickle +3 -0
- vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/length.bin +3 -0
- vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/link_lists.bin +3 -0
.dockerignore
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore Python bytecode
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
|
| 7 |
+
# Virtual environments
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
ENV/
|
| 12 |
+
|
| 13 |
+
# IDE settings
|
| 14 |
+
.idea/
|
| 15 |
+
.vscode/
|
| 16 |
+
*.swp
|
| 17 |
+
*.swo
|
| 18 |
+
|
| 19 |
+
# Environment files (keep .env.example)
|
| 20 |
+
.env
|
| 21 |
+
.env.local
|
| 22 |
+
.env.*.local
|
| 23 |
+
|
| 24 |
+
# Logs (mount as volume in production)
|
| 25 |
+
logs/
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# ChromaDB data (mount as volume in production)
|
| 29 |
+
chroma_db/
|
| 30 |
+
|
| 31 |
+
# Document cache (mount as volume in production)
|
| 32 |
+
document_cache/
|
| 33 |
+
|
| 34 |
+
# Test artifacts
|
| 35 |
+
.pytest_cache/
|
| 36 |
+
.coverage
|
| 37 |
+
htmlcov/
|
| 38 |
+
.tox/
|
| 39 |
+
|
| 40 |
+
# Build artifacts
|
| 41 |
+
dist/
|
| 42 |
+
build/
|
| 43 |
+
*.egg-info/
|
| 44 |
+
|
| 45 |
+
# Jupyter
|
| 46 |
+
.ipynb_checkpoints/
|
| 47 |
+
|
| 48 |
+
# Git
|
| 49 |
+
.git/
|
| 50 |
+
.gitignore
|
| 51 |
+
|
| 52 |
+
# Documentation build
|
| 53 |
+
docs/_build/
|
| 54 |
+
|
| 55 |
+
# Misc
|
| 56 |
+
*.DS_Store
|
| 57 |
+
Thumbs.db
|
.env.template
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SmartDoc AI Environment Configuration
|
| 2 |
+
# Copy this file to .env and fill in your values
|
| 3 |
+
|
| 4 |
+
# =============================================================================
|
| 5 |
+
# REQUIRED SETTINGS
|
| 6 |
+
# =============================================================================
|
| 7 |
+
|
| 8 |
+
# Google API Key for Gemini models (required)
|
| 9 |
+
# Get your key at: https://makersuite.google.com/app/apikey
|
| 10 |
+
GOOGLE_API_KEY=your_google_api_key_here
|
| 11 |
+
|
| 12 |
+
# =============================================================================
|
| 13 |
+
# OPTIONAL SETTINGS (with defaults)
|
| 14 |
+
# =============================================================================
|
| 15 |
+
|
| 16 |
+
# Database settings
|
| 17 |
+
# CHROMA_DB_PATH=./chroma_db
|
| 18 |
+
|
| 19 |
+
# Chunking settings
|
| 20 |
+
# CHUNK_SIZE=1000
|
| 21 |
+
# CHUNK_OVERLAP=100
|
| 22 |
+
|
| 23 |
+
# Retriever settings
|
| 24 |
+
# VECTOR_SEARCH_K=5 # Number of documents to retrieve via vector search
|
| 25 |
+
# VECTOR_FETCH_K=20 # Candidate pool size for MMR
|
| 26 |
+
# VECTOR_SCORE_THRESHOLD=0.3 # Minimum relevance score (0-1)
|
| 27 |
+
# BM25_SEARCH_K=5 # Number of documents to retrieve via BM25
|
| 28 |
+
# HYBRID_RETRIEVER_WEIGHTS=[0.4, 0.6] # [BM25 weight, Vector weight]
|
| 29 |
+
|
| 30 |
+
# Logging settings
|
| 31 |
+
# LOG_LEVEL=INFO
|
| 32 |
+
|
| 33 |
+
# Cache settings
|
| 34 |
+
# CACHE_DIR=document_cache
|
| 35 |
+
# CACHE_EXPIRE_DAYS=7
|
| 36 |
+
|
| 37 |
+
# LLM settings
|
| 38 |
+
# LLM_MAX_RETRIES=3
|
| 39 |
+
# LLM_RETRY_DELAY=1.0
|
| 40 |
+
# LLM_MODEL_NAME=gemini-2.5-flash-lite # Default model for all agents
|
| 41 |
+
|
| 42 |
+
# Agent-specific LLM models (override LLM_MODEL_NAME if needed)
|
| 43 |
+
# RESEARCH_AGENT_MODEL=gemini-2.5-flash-lite
|
| 44 |
+
# VERIFICATION_AGENT_MODEL=gemini-2.5-flash-lite
|
| 45 |
+
# RELEVANCE_CHECKER_MODEL=gemini-2.5-flash-lite
|
| 46 |
+
|
| 47 |
+
# Server settings (optional, for Gradio)
|
| 48 |
+
# GRADIO_SERVER_PORT=7860
|
.gitignore
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
*.egg-info/
|
| 24 |
+
.installed.cfg
|
| 25 |
+
*.egg
|
| 26 |
+
|
| 27 |
+
# PyInstaller
|
| 28 |
+
*.manifest
|
| 29 |
+
*.spec
|
| 30 |
+
|
| 31 |
+
# Unit test / coverage reports
|
| 32 |
+
htmlcov/
|
| 33 |
+
.tox/
|
| 34 |
+
.nox/
|
| 35 |
+
.coverage
|
| 36 |
+
.coverage.*
|
| 37 |
+
.cache
|
| 38 |
+
nosetests.xml
|
| 39 |
+
coverage.xml
|
| 40 |
+
*.cover
|
| 41 |
+
*.py,cover
|
| 42 |
+
.hypothesis/
|
| 43 |
+
.pytest_cache/
|
| 44 |
+
|
| 45 |
+
# Environments - IMPORTANT: Keep .env out of repo
|
| 46 |
+
.env
|
| 47 |
+
.env.local
|
| 48 |
+
.env.*.local
|
| 49 |
+
.venv
|
| 50 |
+
env/
|
| 51 |
+
venv/
|
| 52 |
+
ENV/
|
| 53 |
+
env.bak/
|
| 54 |
+
venv.bak/
|
| 55 |
+
|
| 56 |
+
# IDE
|
| 57 |
+
.idea/
|
| 58 |
+
.vscode/
|
| 59 |
+
*.swp
|
| 60 |
+
*.swo
|
| 61 |
+
*~
|
| 62 |
+
.vs/
|
| 63 |
+
|
| 64 |
+
# Visual Studio
|
| 65 |
+
*.suo
|
| 66 |
+
*.user
|
| 67 |
+
*.userosscache
|
| 68 |
+
*.sln.docstates
|
| 69 |
+
|
| 70 |
+
# Jupyter Notebook
|
| 71 |
+
.ipynb_checkpoints
|
| 72 |
+
|
| 73 |
+
# pyenv
|
| 74 |
+
.python-version
|
| 75 |
+
|
| 76 |
+
# Logs
|
| 77 |
+
logs/
|
| 78 |
+
*.log
|
| 79 |
+
|
| 80 |
+
# Database
|
| 81 |
+
*.db
|
| 82 |
+
*.sqlite3
|
| 83 |
+
|
| 84 |
+
# ChromaDB / Vector stores - Recreated at runtime
|
| 85 |
+
chroma_db/
|
| 86 |
+
*.chroma
|
| 87 |
+
|
| 88 |
+
# Cache - Recreated at runtime
|
| 89 |
+
.cache/
|
| 90 |
+
cache/
|
| 91 |
+
document_cache/
|
| 92 |
+
*.pkl
|
| 93 |
+
|
| 94 |
+
# OS files
|
| 95 |
+
.DS_Store
|
| 96 |
+
Thumbs.db
|
| 97 |
+
|
| 98 |
+
# Temporary files
|
| 99 |
+
tmp/
|
| 100 |
+
temp/
|
| 101 |
+
*.tmp
|
| 102 |
+
|
| 103 |
+
# Hugging Face Spaces
|
| 104 |
+
.gradio/
|
| 105 |
+
flagged/
|
README.md
CHANGED
|
@@ -1,14 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
| 1 |
+
# SmartDoc AI
|
| 2 |
+
|
| 3 |
+
SmartDoc AI is an advanced document analysis and question answering system. It allows you to upload documents, ask questions, and receive accurate, source-verified answers. The system uses a multi-agent workflow, hybrid search, and both local and cloud-based chart detection for high performance and cost efficiency.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Features
|
| 8 |
+
|
| 9 |
+
- **Multi-format Document Support**: PDF, DOCX, TXT, and Markdown
|
| 10 |
+
- **Smart Chunking**: Configurable chunk size and overlap for optimal retrieval
|
| 11 |
+
- **Intelligent Caching**: Speeds up repeated queries
|
| 12 |
+
- **Chart Extraction**: Detects and analyzes charts using OpenCV and Gemini Vision
|
| 13 |
+
- **Hybrid Search**: Combines keyword and vector search for best results
|
| 14 |
+
- **Multi-Agent Workflow**: Relevance checking, research, and answer verification
|
| 15 |
+
- **Production Ready**: Structured logging, environment-based config, and test suite
|
| 16 |
+
- **Efficient**: Local chart detection saves up to 95% on API costs
|
| 17 |
+
|
| 18 |
---
|
| 19 |
+
|
| 20 |
+
## Quick Start
|
| 21 |
+
|
| 22 |
+
### Prerequisites
|
| 23 |
+
- Python 3.11 or higher
|
| 24 |
+
- Google API Key for Gemini models ([Get one here](https://ai.google.dev/))
|
| 25 |
+
|
| 26 |
+
### Installation
|
| 27 |
+
|
| 28 |
+
1. Clone the repository:
|
| 29 |
+
```bash
|
| 30 |
+
git clone https://github.com/TilanTAB/Intelligent-Document-Analysis-Q-A-3.git
|
| 31 |
+
cd Intelligent-Document-Analysis-Q-A-3
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
2. Activate the virtual environment:
|
| 35 |
+
```bash
|
| 36 |
+
# Windows PowerShell
|
| 37 |
+
.\activate_venv.ps1
|
| 38 |
+
# Windows Command Prompt
|
| 39 |
+
activate_venv.bat
|
| 40 |
+
# Or manually:
|
| 41 |
+
.\venv\Scripts\Activate.ps1
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
3. Install dependencies (if needed):
|
| 45 |
+
```bash
|
| 46 |
+
pip install -r dependencies.txt
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
4. Configure environment variables:
|
| 50 |
+
```bash
|
| 51 |
+
cp .env.template .env
|
| 52 |
+
# Edit .env and set your API key
|
| 53 |
+
GOOGLE_API_KEY=your_api_key_here
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
5. (Optional) Verify installation:
|
| 57 |
+
```bash
|
| 58 |
+
python verify_environment.py
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
6. Run the application:
|
| 62 |
+
```bash
|
| 63 |
+
python main.py
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
7. Open your browser to [http://localhost:7860](http://localhost:7860)
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Configuration
|
| 71 |
+
|
| 72 |
+
All settings can be configured via environment variables or the `.env` file. Key options include:
|
| 73 |
+
- `GOOGLE_API_KEY`: Your Gemini API key (required)
|
| 74 |
+
- `CHUNK_SIZE`, `CHUNK_OVERLAP`: Document chunking
|
| 75 |
+
- `ENABLE_CHART_EXTRACTION`: Enable/disable chart detection
|
| 76 |
+
- `CHART_USE_LOCAL_DETECTION`: Use OpenCV for free chart detection
|
| 77 |
+
- `CHART_ENABLE_BATCH_ANALYSIS`: Batch process charts for speed
|
| 78 |
+
- `CHART_GEMINI_BATCH_SIZE`: Number of charts per Gemini API call
|
| 79 |
+
- `LOG_LEVEL`: Logging verbosity
|
| 80 |
+
- `GRADIO_SERVER_PORT`: Web interface port
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## Project Structure
|
| 85 |
+
|
| 86 |
+
- `intelligence/` - Multi-agent system (relevance, research, verification)
|
| 87 |
+
- `configuration/` - App settings and logging
|
| 88 |
+
- `content_analyzer/` - Document and chart processing
|
| 89 |
+
- `search_engine/` - Hybrid retriever logic
|
| 90 |
+
- `core/` - Utilities and diagnostics
|
| 91 |
+
- `tests/` - Test suite
|
| 92 |
+
- `main.py` - Application entry point
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
## Troubleshooting
|
| 97 |
+
|
| 98 |
+
- **API Key Not Found**: Set `GOOGLE_API_KEY` in your `.env` file.
|
| 99 |
+
- **Python 3.13 Issues**: Use Python 3.11 or 3.12 for best compatibility.
|
| 100 |
+
- **Chart Detection Slow**: Lower `CHART_DPI` or `CHART_MAX_IMAGE_SIZE` in `.env`.
|
| 101 |
+
- **ChromaDB Lock Issues**: Stop all instances and remove lock files in `vector_store/`.
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## Contributing
|
| 106 |
+
|
| 107 |
+
Contributions are welcome! Please fork the repository, create a feature branch, and submit a pull request with a clear description.
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## License
|
| 112 |
+
|
| 113 |
+
This project is licensed under the MIT License.
|
| 114 |
+
|
| 115 |
---
|
| 116 |
|
| 117 |
+
SmartDoc AI is actively maintained and designed for real-world document analysis and Q&A. For updates and support, visit the [GitHub repository](https://github.com/TilanTAB/Intelligent-Document-Analysis-Q-A-3).
|
activate_venv.bat
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
|
| 2 |
+
REM SmartDoc AI - Virtual Environment Activation Script (Windows Command Prompt)
|
| 3 |
+
REM Run this script to activate the virtual environment
|
| 4 |
+
|
| 5 |
+
echo ?? Activating SmartDoc AI virtual environment...
|
| 6 |
+
echo.
|
| 7 |
+
|
| 8 |
+
call venv\Scripts\activate.bat
|
| 9 |
+
|
| 10 |
+
echo.
|
| 11 |
+
echo ? Virtual environment activated!
|
| 12 |
+
echo.
|
| 13 |
+
echo ?? To run the application:
|
| 14 |
+
echo python main.py
|
| 15 |
+
echo.
|
| 16 |
+
echo ?? To deactivate:
|
| 17 |
+
echo deactivate
|
| 18 |
+
echo.
|
activate_venv.ps1
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SmartDoc AI - Virtual Environment Activation Script
|
| 2 |
+
# Run this script to activate the virtual environment
|
| 3 |
+
|
| 4 |
+
Write-Host "?? Activating SmartDoc AI virtual environment..." -ForegroundColor Cyan
|
| 5 |
+
|
| 6 |
+
# Activate the virtual environment
|
| 7 |
+
& ".\venv\Scripts\Activate.ps1"
|
| 8 |
+
|
| 9 |
+
Write-Host "? Virtual environment activated!" -ForegroundColor Green
|
| 10 |
+
Write-Host ""
|
| 11 |
+
Write-Host "?? Installed packages:" -ForegroundColor Yellow
|
| 12 |
+
pip list | Select-String -Pattern "langchain|chromadb|gradio|opencv|google-generativeai"
|
| 13 |
+
Write-Host ""
|
| 14 |
+
Write-Host "?? To run the application:" -ForegroundColor Cyan
|
| 15 |
+
Write-Host " python main.py" -ForegroundColor White
|
| 16 |
+
Write-Host ""
|
| 17 |
+
Write-Host "?? To deactivate:" -ForegroundColor Cyan
|
| 18 |
+
Write-Host " deactivate" -ForegroundColor White
|
configuration/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .parameters import parameters
|
| 2 |
+
from .definitions import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
|
| 3 |
+
|
| 4 |
+
__all__ = ["parameters", "MAX_FILE_SIZE", "MAX_TOTAL_SIZE", "ALLOWED_TYPES"]
|
configuration/definitions.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Upload limits and the document formats the pipeline accepts.

# Largest single upload we accept: 50 MB.
MAX_FILE_SIZE: int = 50 * 1024 * 1024

# Combined cap across every file in one upload batch: 200 MB.
MAX_TOTAL_SIZE: int = 200 * 1024 * 1024

# File extensions the document parser knows how to handle.
ALLOWED_TYPES: list = [".txt", ".pdf", ".docx", ".md"]
|
configuration/logger_setup.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from logging.handlers import RotatingFileHandler, QueueHandler, QueueListener
|
| 3 |
+
import queue
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# Custom formatter to remove unsupported Unicode characters
|
| 9 |
+
class SafeFormatter(logging.Formatter):
    """Formatter that keeps log lines safe for narrow encodings.

    Any character outside the 0-255 (Latin-1 / cp1252-compatible) range is
    replaced with ``'?'`` so console/file sinks on Windows do not raise
    ``UnicodeEncodeError``.
    """

    def format(self, record):
        """Format the record, then strip characters above code point 255."""
        formatted = super().format(record)
        return ''.join(ch if ord(ch) < 256 else '?' for ch in formatted)
|
| 15 |
+
|
| 16 |
+
# Ensure the logs directory exists
|
| 17 |
+
# --- Log destinations -------------------------------------------------------

# Make sure ./logs exists before any handler tries to open a file there.
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

log_file_path = os.path.join("logs", "app.log")

# Unbounded queue (-1) decouples log producers from the slower file writer.
log_queue = queue.Queue(-1)

# Timestamp, logger name, level, source location, message.
detailed_format = "%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"

# Rotating file sink: 10 MB per file, 5 backups, file opened lazily on first emit.
file_handler = RotatingFileHandler(
    log_file_path,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    delay=True,
)
file_handler.setFormatter(SafeFormatter(detailed_format))

# Producers push records onto the queue; the listener below drains it.
queue_handler = QueueHandler(log_queue)

# Console output is written synchronously (not through the queue).
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(SafeFormatter(detailed_format))
console_handler.setLevel(logging.INFO)

# Root logger: console directly, file via the queue.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.handlers = [console_handler, queue_handler]

# Background thread that moves queued records into the rotating file.
listener = QueueListener(log_queue, file_handler)
listener.start()

# Quiet down chatty third-party libraries.
logging.getLogger("langchain").setLevel(logging.WARNING)
logging.getLogger("langchain_community").setLevel(logging.WARNING)
logging.getLogger("chromadb").setLevel(logging.WARNING)
logging.getLogger("google").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

root_logger.info("Logging system initialized successfully.")
|
configuration/parameters.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 2 |
+
from pydantic import Field, field_validator
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import os
|
| 5 |
+
from .definitions import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Settings(BaseSettings):
    """
    Application parameters loaded from environment variables.

    For local development:
        Create a .env file in the project root with your configuration:
        GOOGLE_API_KEY=your_api_key_here

    For Hugging Face Spaces:
        Add GOOGLE_API_KEY as a secret in Space Settings > Repository secrets
    """
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
        case_sensitive=False,
    )

    # --- File upload limits (defaults come from configuration.definitions) ---
    MAX_FILE_SIZE: int = MAX_FILE_SIZE
    MAX_TOTAL_SIZE: int = MAX_TOTAL_SIZE
    ALLOWED_TYPES: list = ALLOWED_TYPES

    # --- API keys (required; env var locally, HF Secrets when deployed) ---
    GOOGLE_API_KEY: str = Field(
        ...,  # no default: construction fails without a key
        description="Google API key for Gemini models",
    )

    # --- Vector database ---
    CHROMA_DB_PATH: str = "./chroma_db"

    # --- Document chunking ---
    CHUNK_SIZE: int = 2000
    CHUNK_OVERLAP: int = 100

    # --- Retrieval ---
    VECTOR_SEARCH_K: int = 25
    # NOTE(review): mixed-case name kept as-is for compatibility with callers.
    VECTOR_Search_K_CHROMA: int = 15
    VECTOR_FETCH_K: int = 35
    VECTOR_SCORE_THRESHOLD: float = 0.3
    BM25_SEARCH_K: int = 8
    HYBRID_RETRIEVER_WEIGHTS: list = [0.4, 0.6]  # [BM25 weight, Vector weight]
    CHROMA_COLLECTION_NAME: str = "documents"

    # --- Agent workflow ---
    MAX_RESEARCH_ATTEMPTS: int = 2
    ENABLE_QUERY_REWRITING: bool = True
    MAX_QUERY_REWRITES: int = 1
    RELEVANCE_CHECK_K: int = 20

    # --- Research agent ---
    RESEARCH_TOP_K: int = 15
    # Effectively unlimited. NOTE(review): 8e9 chars looks like a typo for a
    # smaller limit (the .env.template documents no such value) — confirm intent.
    RESEARCH_MAX_CONTEXT_CHARS: int = 8000000000
    RESEARCH_MAX_OUTPUT_TOKENS: int = 500

    # --- Verification agent ---
    # NOTE(review): same concern as above — 8e8 chars is effectively no limit.
    VERIFICATION_MAX_CONTEXT_CHARS: int = 800000000
    VERIFICATION_MAX_OUTPUT_TOKENS: int = 300

    # --- Logging ---
    LOG_LEVEL: str = "INFO"

    # --- Document cache ---
    CACHE_DIR: str = "document_cache"
    CACHE_EXPIRE_DAYS: int = 7

    # --- LLM defaults ---
    LLM_MAX_RETRIES: int = 3
    LLM_RETRY_DELAY: float = 1.0
    LLM_MODEL_NAME: str = "gemini-2.5-flash-lite"  # Default model for all agents

    # --- Per-agent model overrides ---
    RESEARCH_AGENT_MODEL: str = "gemini-2.5-flash-lite"
    VERIFICATION_AGENT_MODEL: str = "gemini-2.5-flash-lite"
    RELEVANCE_CHECKER_MODEL: str = "gemini-2.5-flash-lite"

    # --- Chart extraction ---
    ENABLE_CHART_EXTRACTION: bool = True
    CHART_VISION_MODEL: str = "gemini-2.5-flash-lite"
    CHART_MAX_TOKENS: int = 1500
    CHART_DPI: int = 150  # Lower DPI saves memory
    CHART_BATCH_SIZE: int = 3  # Process pages in batches
    CHART_MAX_IMAGE_SIZE: int = 1920  # Max dimension for images

    # --- Local chart detection (cost optimization: OpenCV first, free) ---
    CHART_USE_LOCAL_DETECTION: bool = True
    CHART_MIN_CONFIDENCE: float = 0.4  # Only analyze charts above 40% confidence
    CHART_SKIP_GEMINI_DETECTION: bool = True  # Gemini used for analysis only
    CHART_GEMINI_FALLBACK_ENABLED: bool = False  # Optionally fall back to Gemini

    # --- Gemini batch processing (speed optimization) ---
    CHART_GEMINI_BATCH_SIZE: int = 1  # 1 chart per call (reduced from 2 for reliability)
    CHART_ENABLE_BATCH_ANALYSIS: bool = True

    @field_validator("GOOGLE_API_KEY")
    @classmethod
    def validate_api_key(cls, v: str) -> str:
        """Reject missing, blank, or obvious placeholder API keys."""
        if not v or not v.strip():
            raise ValueError("GOOGLE_API_KEY is required. Set it in your .env file or HF Secrets.")
        if v.startswith("your_") or v == "YOUR_API_KEY_HERE":
            raise ValueError("Please replace the placeholder GOOGLE_API_KEY with your actual API key.")
        return v
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _get_parameters():
    """Construct the Settings instance, printing actionable hints on failure.

    Detects whether the process runs inside a Hugging Face Space (via the
    SPACE_ID env var) so the printed hint matches the deployment environment.
    The original exception is always re-raised after the hints are printed.
    """
    running_on_hf = os.environ.get("SPACE_ID") is not None

    try:
        return Settings()
    except Exception as err:
        import sys

        print(f"⚠️ Configuration Error: {err}", file=sys.stderr)
        if running_on_hf:
            print("💡 Tip: Add GOOGLE_API_KEY in Space Settings > Repository secrets", file=sys.stderr)
        else:
            print("💡 Tip: Create a .env file with GOOGLE_API_KEY=your_api_key", file=sys.stderr)
        raise
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# Create parameters instance
|
| 133 |
+
parameters = _get_parameters()
|
content_analyzer/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .document_parser import DocumentProcessor
|
| 2 |
+
|
| 3 |
+
__all__ = ["DocumentProcessor"]
|
content_analyzer/document_parser.py
ADDED
|
@@ -0,0 +1,842 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import hashlib
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 9 |
+
from configuration.parameters import parameters
|
| 10 |
+
from configuration.definitions import MAX_TOTAL_SIZE, ALLOWED_TYPES
|
| 11 |
+
import concurrent.futures
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import gc
|
| 14 |
+
from google.genai import types
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
def preprocess_image(image, max_dim=1000):
|
| 19 |
+
"""Downscale image to max_dim before OpenCV processing."""
|
| 20 |
+
if max(image.size) > max_dim:
|
| 21 |
+
ratio = max_dim / max(image.size)
|
| 22 |
+
new_size = tuple(int(dim * ratio) for dim in image.size)
|
| 23 |
+
return image.resize(new_size, Image.Resampling.LANCZOS)
|
| 24 |
+
return image
|
| 25 |
+
|
| 26 |
+
def detect_chart_on_page(args):
    """
    Top-level function for parallel local chart detection (required for ProcessPoolExecutor).
    Returns the page number, the PIL image, and the detection result.
    """
    page_num, page_image = args
    # Imported inside the worker so the function pickles cleanly for process pools.
    from content_analyzer.visual_detector import LocalChartDetector
    # Shrink the render first so OpenCV operates on a smaller bitmap.
    page_image = preprocess_image(page_image, max_dim=1000)
    result = LocalChartDetector.detect_charts(page_image)
    # The image is intentionally NOT freed here; the parent process saves it.
    return (page_num, page_image, result)
| 38 |
+
|
| 39 |
+
def analyze_batch(batch_tuple):
    """
    Top-level function for parallel Gemini batch analysis (future-proof for process pools).

    Sends one prompt plus all chart images of the batch to Gemini in a single
    request, then splits the response on the "---CHART N---" markers and builds
    one Document per chart.

    Args:
        batch_tuple: (batch, batch_num, total_batches, gemini_client, file_path,
            parameters, stats). `batch` is a list of
            (page_num, image_path, detection_result) triples.

    Returns:
        tuple: (zero-based batch index, list of chart Documents). The list is
        empty when the API call or response parsing fails.
    """
    batch, batch_num, total_batches, gemini_client, file_path, parameters, stats = batch_tuple
    import logging
    logger = logging.getLogger(__name__)
    images = []
    try:
        from PIL import Image
        from google.genai import types
        # Open every cached chart image belonging to this batch.
        images = [Image.open(image_path) for _, image_path, _ in batch]
        prompt = f"""
Analyze the following {len(batch)} chart(s)/graph(s) in order.

For EACH chart, provide comprehensive analysis separated by the marker "---CHART N---".

For each chart include:
**Chart Type**: [line/bar/pie/bubble/scatter/etc]
**Title**: [chart title]
**X-axis**: [label and units]
**Y-axis**: [label and units]
**Data Points**: [extract ALL visible data with exact values]
**Legend**: [list all series/categories]
**Trends**: [key patterns, trends, insights]
**Key Values**: [maximum, minimum, significant values]
**Context**: [any annotations or notes]

Format exactly as:
---CHART 1---
[analysis]

---CHART 2---
[analysis]

---CHART 3---
[analysis]
"""
        # One API call covers the whole batch; token budget scales with batch size.
        chart_response = gemini_client.models.generate_content(
            model=parameters.CHART_VISION_MODEL,
            contents=[prompt] + images,
            config=types.GenerateContentConfig(
                max_output_tokens=parameters.CHART_MAX_TOKENS * len(batch)
            )
        )
        # NOTE(review): `stats` is a dict shared across worker threads with no
        # lock; these increments race but are only used for logging/cost
        # estimates, and they will NOT propagate back from a process pool.
        stats['batch_api_calls'] += 1
        response_text = chart_response.text
        parts = response_text.split('---CHART ')
        batch_docs = []
        for idx, (page_num, image_path, detection_result) in enumerate(batch):
            if idx + 1 < len(parts):
                analysis_text = parts[idx + 1]
                # Trim anything belonging to the next chart section.
                if '---CHART' in analysis_text:
                    analysis_text = analysis_text.split('---CHART')[0]
                lines = analysis_text.split('\n')
                # Drop the leftover "N---" marker fragment heading the section.
                if lines and '---' in lines[0]:
                    lines = lines[1:]
                analysis = '\n'.join(lines).strip()
            else:
                # Model produced fewer sections than charts in the batch.
                analysis = "Analysis unavailable (parsing error)"
            chart_types_str = ", ".join(detection_result['chart_types']) or "Unknown"
            confidence = detection_result['confidence']
            chart_doc = Document(
                page_content=f"""### 📊 Chart Analysis (Page {page_num})\n\n**Detection Method**: Hybrid (Local OpenCV + Gemini Batch Analysis)\n**Local Confidence**: {confidence:.0%}\n**Detected Types**: {chart_types_str}\n**Batch Size**: {len(batch)} charts analyzed together\n\n---\n\n{analysis}\n""",
                metadata={
                    "source": file_path,
                    "page": page_num,
                    "type": "chart",
                    "extraction_method": "hybrid_batch",
                    "detection_confidence": confidence,
                    "batch_size": len(batch)
                }
            )
            batch_docs.append(chart_doc)
            stats['charts_analyzed_gemini'] += 1
        logger.info(f"✅ Batch {batch_num} complete ({len(batch)} charts analyzed)")
        return (batch_num - 1, batch_docs)
    except Exception as e:
        logger.error(f"Batch analysis failed: {e}, trying sequential fallback...")
        return (batch_num - 1, [])
    finally:
        # FIX: close images even when the API call or parsing raises — the
        # original only closed them on the success path, leaking one file
        # handle per chart on every failed batch.
        for img in images:
            try:
                img.close()
            except Exception:
                pass
| 122 |
+
|
| 123 |
+
class DocumentProcessor:
|
| 124 |
+
"""
|
| 125 |
+
Processes documents by splitting them into manageable chunks and caching
|
| 126 |
+
the results to avoid reprocessing. Handles chart extraction using local
|
| 127 |
+
OpenCV detection and Gemini Vision API with parallelization for speed.
|
| 128 |
+
"""
|
| 129 |
+
# Cache metadata version - increment when cache format changes
|
| 130 |
+
CACHE_VERSION = 4 # Incremented for chart extraction support
|
| 131 |
+
|
| 132 |
+
    def __init__(self):
        """Initialize the document processor with cache directory and splitter configuration."""
        # Chunk-cache directory; created eagerly so later cache reads/writes
        # never need to check for its existence.
        self.cache_dir = Path(parameters.CACHE_DIR)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # Character-count based recursive splitter; sizes come from global parameters.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=parameters.CHUNK_SIZE,
            chunk_overlap=parameters.CHUNK_OVERLAP,
            length_function=len,
            is_separator_regex=False,
        )
        # Populated by _init_gemini_vision() on success; stays None when chart
        # extraction is disabled or the google-genai package is unavailable.
        self.gemini_client = None
        self.genai_module = None  # Store the module reference
        if parameters.ENABLE_CHART_EXTRACTION:
            # NOTE: may flip parameters.ENABLE_CHART_EXTRACTION to False on failure.
            self._init_gemini_vision()
        logger.debug(f"DocumentProcessor initialized with cache dir: {self.cache_dir}")
        logger.debug(f"Chunk size: {parameters.CHUNK_SIZE}, Chunk overlap: {parameters.CHUNK_OVERLAP}")
        logger.debug(f"Chart extraction: {'enabled' if parameters.ENABLE_CHART_EXTRACTION else 'disabled'}")
|
| 149 |
+
|
| 150 |
+
def _init_gemini_vision(self):
|
| 151 |
+
"""Initialize Gemini Vision client for chart analysis."""
|
| 152 |
+
genai = None
|
| 153 |
+
try:
|
| 154 |
+
# Use the new google.genai package
|
| 155 |
+
import google.genai as genai
|
| 156 |
+
logger.debug("✅ Loaded google.genai (new package)")
|
| 157 |
+
except ImportError as e:
|
| 158 |
+
logger.warning(f"google-genai not installed: {e}")
|
| 159 |
+
logger.info("Install with: pip install google-genai")
|
| 160 |
+
parameters.ENABLE_CHART_EXTRACTION = False
|
| 161 |
+
return
|
| 162 |
+
self.genai_module = genai
|
| 163 |
+
try:
|
| 164 |
+
from google import genai
|
| 165 |
+
self.gemini_client = genai.Client(api_key=parameters.GOOGLE_API_KEY)
|
| 166 |
+
logger.info(f"✅ Gemini Vision client initialized")
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logger.error(f"❌ Failed to initialize Gemini Vision client: {e}")
|
| 169 |
+
parameters.ENABLE_CHART_EXTRACTION = False
|
| 170 |
+
|
| 171 |
+
def validate_files(self, files: List) -> bool:
|
| 172 |
+
"""
|
| 173 |
+
Validate that uploaded files meet size and type requirements.
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
files: List of uploaded file objects
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
bool: True if all validations pass
|
| 180 |
+
|
| 181 |
+
Raises:
|
| 182 |
+
ValueError: If validation fails
|
| 183 |
+
"""
|
| 184 |
+
if not files:
|
| 185 |
+
raise ValueError("No files provided")
|
| 186 |
+
|
| 187 |
+
total_size = 0
|
| 188 |
+
for file in files:
|
| 189 |
+
# Get file size
|
| 190 |
+
if hasattr(file, 'size'):
|
| 191 |
+
file_size = file.size
|
| 192 |
+
else:
|
| 193 |
+
# Fallback: read file to get size
|
| 194 |
+
try:
|
| 195 |
+
with open(file.name, 'rb') as f:
|
| 196 |
+
file_size = len(f.read())
|
| 197 |
+
except Exception as e:
|
| 198 |
+
logger.error(f"Failed to determine file size for {file.name}: {e}")
|
| 199 |
+
raise ValueError(f"Cannot read file: {file.name}")
|
| 200 |
+
|
| 201 |
+
# Check individual file size
|
| 202 |
+
if file_size > parameters.MAX_FILE_SIZE:
|
| 203 |
+
raise ValueError(
|
| 204 |
+
f"File {file.name} exceeds maximum size "
|
| 205 |
+
f"({file_size / 1024 / 1024:.2f}MB > {parameters.MAX_FILE_SIZE / 1024 / 1024:.2f}MB)"
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
# Check file type
|
| 209 |
+
file_ext = Path(file.name).suffix.lower()
|
| 210 |
+
if file_ext not in ALLOWED_TYPES:
|
| 211 |
+
raise ValueError(
|
| 212 |
+
f"File type {file_ext} not supported. Allowed types: {ALLOWED_TYPES}"
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
total_size += file_size
|
| 216 |
+
|
| 217 |
+
# Check total size
|
| 218 |
+
if total_size > parameters.MAX_TOTAL_SIZE:
|
| 219 |
+
raise ValueError(
|
| 220 |
+
f"Total file size exceeds maximum "
|
| 221 |
+
f"({total_size / 1024 / 1024:.2f}MB > {parameters.MAX_TOTAL_SIZE / 1024 / 1024:.2f}MB)"
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
logger.info(f"Validation passed for {len(files)} files (total: {total_size / 1024 / 1024:.2f}MB)")
|
| 225 |
+
return True
|
| 226 |
+
|
| 227 |
+
def _generate_hash(self, content: bytes) -> str:
|
| 228 |
+
"""Generate SHA-256 hash of file content."""
|
| 229 |
+
return hashlib.sha256(content).hexdigest()
|
| 230 |
+
|
| 231 |
+
def _is_cache_valid(self, cache_path: Path) -> bool:
|
| 232 |
+
"""Check if a cache file exists and is still valid (not expired)."""
|
| 233 |
+
if not cache_path.exists():
|
| 234 |
+
logger.debug(f"Cache miss: {cache_path.name}")
|
| 235 |
+
return False
|
| 236 |
+
|
| 237 |
+
file_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
|
| 238 |
+
if file_age > timedelta(days=parameters.CACHE_EXPIRE_DAYS):
|
| 239 |
+
logger.info(f"Cache expired (age: {file_age.days} days): {cache_path.name}")
|
| 240 |
+
cache_path.unlink()
|
| 241 |
+
return False
|
| 242 |
+
|
| 243 |
+
logger.debug(f"Cache hit: {cache_path.name} (age: {file_age.days} days)")
|
| 244 |
+
return True
|
| 245 |
+
|
| 246 |
+
    def _load_from_cache(self, cache_path: Path) -> List:
        """Loads chunks from a pickle file, handling potential corruption.

        Returns the cached chunk list, or [] when the file is corrupt or
        unreadable; any bad cache file is deleted so it gets rebuilt later.
        """
        try:
            with open(cache_path, "rb") as f:
                data = pickle.load(f)

            # Sanity-check the payload shape written by _save_to_cache.
            if "chunks" not in data or "timestamp" not in data:
                raise KeyError("Cache file missing 'chunks' or 'timestamp' key.")

            logger.info(f"Loaded {len(data['chunks'])} chunks from cache: {cache_path.name}")
            return data["chunks"]
        except (pickle.UnpicklingError, KeyError, EOFError) as e:
            # Known corruption modes: bad pickle stream, wrong payload dict,
            # truncated file. Delete so the cache entry is regenerated.
            logger.warning(f"Cache corruption detected in {cache_path.name}: {e}. Deleting cache.")
            cache_path.unlink()
            return []
        except Exception as e:
            # Anything else (e.g. classes changed since pickling) — also drop
            # the cache file, but guard the unlink since the failure mode is unknown.
            logger.error(f"Unexpected error loading cache {cache_path.name}: {e}", exc_info=True)
            if cache_path.exists():
                cache_path.unlink()
            return []
|
| 266 |
+
|
| 267 |
+
def _save_to_cache(self, chunks: List, cache_path: Path):
|
| 268 |
+
"""Saves chunks to a pickle file."""
|
| 269 |
+
try:
|
| 270 |
+
with open(cache_path, "wb") as f:
|
| 271 |
+
pickle.dump({
|
| 272 |
+
"timestamp": datetime.now().timestamp(),
|
| 273 |
+
"chunks": chunks
|
| 274 |
+
}, f)
|
| 275 |
+
logger.info(f"Successfully cached {len(chunks)} chunks to {cache_path.name}")
|
| 276 |
+
except Exception as e:
|
| 277 |
+
logger.error(f"Failed to save cache to {cache_path.name}: {e}", exc_info=True)
|
| 278 |
+
|
| 279 |
+
    def _process_file(self, file, progress_callback=None) -> List[Document]:
        """Load one uploaded file, split it into chunk Documents, and return them.

        PDFs run text extraction and chart extraction concurrently in two
        threads (with sequential fallbacks on MemoryError); other types go
        through a langchain loader. Each chunk gets a deterministic chunk_id of
        the form "<file_hash>_<page>_<chunk_index>".

        Raises:
            Exception: re-raises unexpected processing errors (ImportError and
            unsupported types return [] instead).
        """
        file_ext = Path(file.name).suffix.lower()
        if file_ext not in ALLOWED_TYPES:
            logger.warning(f"Skipping unsupported file type: {file.name}")
            return []
        try:
            documents = []
            if file_ext == '.pdf':
                import concurrent.futures
                results = {}  # NOTE(review): never used — looks like leftover scaffolding
                def run_pdfplumber():
                    # Thread target: text + table extraction.
                    return self._load_pdf_with_pdfplumber(file.name)
                def run_charts():
                    # Thread target: chart extraction (no-op when disabled).
                    logger.info(f"ENABLE_CHART_EXTRACTION={parameters.ENABLE_CHART_EXTRACTION}, gemini_client={self.gemini_client is not None}")
                    if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
                        return self._extract_charts_from_pdf(file.name)
                    return []
                try:
                    # Run text and chart extraction in parallel; each future has
                    # its own MemoryError fallback to a sequential retry.
                    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                        future_pdf = executor.submit(run_pdfplumber)
                        future_charts = executor.submit(run_charts)
                        try:
                            docs = future_pdf.result()
                        except MemoryError as e:
                            logger.error(f"Out of memory in PDFPlumber thread: {e}. Falling back to sequential.")
                            docs = self._load_pdf_with_pdfplumber(file.name)
                        try:
                            chart_docs = future_charts.result()
                        except MemoryError as e:
                            logger.error(f"Out of memory in chart extraction thread: {e}. Falling back to sequential.")
                            # NOTE(review): this retry skips the ENABLE_CHART_EXTRACTION
                            # guard that run_charts applies — presumably intentional; verify.
                            chart_docs = self._extract_charts_from_pdf(file.name)
                    documents = docs or []
                    if chart_docs:
                        documents.extend(chart_docs)
                        logger.info(f"📊 Added {len(chart_docs)} chart descriptions to {file.name}")
                except MemoryError as e:
                    # Whole-executor OOM: redo both stages strictly sequentially.
                    logger.error(f"Out of memory in parallel PDF processing: {e}. Falling back to sequential.")
                    documents = self._load_pdf_with_pdfplumber(file.name)
                    if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
                        chart_docs = self._extract_charts_from_pdf(file.name)
                        if chart_docs:
                            documents.extend(chart_docs)
                            logger.info(f"📊 Added {len(chart_docs)} chart descriptions to {file.name}")
            else:
                # Non-PDF types: delegate to the matching langchain loader.
                from langchain_community.document_loaders import (
                    Docx2txtLoader,
                    TextLoader,
                )
                loader_map = {
                    '.docx': Docx2txtLoader,
                    '.txt': TextLoader,
                    '.md': TextLoader,
                }
                loader_class = loader_map.get(file_ext)
                if not loader_class:
                    logger.warning(f"No loader found for {file_ext}")
                    return []
                logger.info(f"Loading {file_ext} file: {file.name}")
                loader = loader_class(file.name)
                documents = loader.load()
            if not documents:
                logger.warning(f"No content extracted from {file.name}")
                return []
            all_chunks = []
            total_docs = len(documents)
            # Hash of the PATH (not the content), so the id is stable per file location.
            file_hash = self._generate_hash(file.name.encode())  # Unique per file
            for i, doc in enumerate(documents):
                page_chunks = self.splitter.split_text(doc.page_content)
                total_chunks = len(page_chunks)
                for j, chunk in enumerate(page_chunks):
                    chunk_id = f"{file_hash}_{doc.metadata.get('page', i + 1)}_{j}"
                    chunk_doc = Document(
                        page_content=chunk,
                        metadata={
                            "source": doc.metadata.get("source", file.name),
                            "page": doc.metadata.get("page", i + 1),
                            "type": doc.metadata.get("type", "text"),
                            "chunk_id": chunk_id
                        }
                    )
                    all_chunks.append(chunk_doc)
                    if progress_callback:
                        # Fractional progress across pages and chunks within a page.
                        percent = int(100 * ((i + (j + 1) / total_chunks) / total_docs))
                        step = f"Splitting page {i+1} into chunks"
                        progress_callback(percent, step)
            logger.info(f"Processed {file.name}: {len(documents)} page(s) → {len(all_chunks)} chunk(s)")
            return all_chunks
        except ImportError as e:
            logger.error(f"Required loader not installed for {file_ext}: {e}")
            return []
        except Exception as e:
            logger.error(f"Failed to process {file.name}: {e}", exc_info=True)
            raise
|
| 372 |
+
|
| 373 |
+
    def _extract_charts_from_pdf(self, file_path: str) -> List[Document]:
        """
        Extract and analyze charts/graphs from PDF with true batch processing and parallelism.
        PHASE 1: Parallel local chart detection (CPU-bound, uses ProcessPoolExecutor)
        PHASE 2: Parallel Gemini batch analysis (I/O-bound, uses ThreadPoolExecutor)

        Returns a list of chart-analysis Documents (deduplicated by chart
        title), or [] on missing dependencies / OOM / any other failure.
        """
        # Path-based hash used for deterministic chunk_ids on chart Documents.
        file_hash = self._generate_hash(file_path.encode())
        def deduplicate_charts_by_title(chart_chunks):
            # Keep the first analysis per "**Title**: ..." value; charts whose
            # analysis has no parseable title are always kept.
            seen_titles = set()
            unique_chunks = []
            import re
            for chunk in chart_chunks:
                match = re.search(r"\*\*Title\*\*:\s*(.+)", chunk.page_content)
                title = match.group(1).strip() if match else None
                if title and title not in seen_titles:
                    seen_titles.add(title)
                    unique_chunks.append(chunk)
                elif not title:
                    unique_chunks.append(chunk)
            return unique_chunks
        try:
            from pdf2image import convert_from_path
            from PIL import Image
            import pdfplumber
            import tempfile
            import os

            # Import local detector if enabled
            use_local = parameters.CHART_USE_LOCAL_DETECTION
            if use_local:
                try:
                    from content_analyzer.visual_detector import LocalChartDetector
                    logger.info(f"📊 [BATCH MODE] Local detection → Temp cache → Batch analysis")
                except ImportError:
                    logger.warning("Local chart detector not available, falling back to Gemini")
                    use_local = False

            # Track statistics (logging/cost estimation only).
            # NOTE(review): this dict is mutated from worker threads in
            # analyze_batch without a lock — counts may race slightly.
            stats = {
                'pages_scanned': 0,
                'charts_detected_local': 0,
                'charts_analyzed_gemini': 0,
                'api_calls_saved': 0,
                'batch_api_calls': 0
            }

            # Get PDF page count
            with pdfplumber.open(file_path) as pdf:
                total_pages = len(pdf.pages)

            logger.info(f"Processing {total_pages} pages for chart detection...")

            # Create temp directory for chart images; removed in the finally below.
            temp_dir = tempfile.mkdtemp(prefix='charts_')
            detected_charts = []  # [(page_num, image_path, detection_result), ...]

            try:
                # === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
                logger.info("Phase 1: Detecting charts and caching to disk...")
                # Render pages in windows of CHART_BATCH_SIZE to bound memory.
                batch_size = parameters.CHART_BATCH_SIZE
                page_image_tuples = []
                for start_page in range(1, total_pages + 1, batch_size):
                    end_page = min(start_page + batch_size - 1, total_pages)
                    try:
                        images = convert_from_path(
                            file_path,
                            dpi=parameters.CHART_DPI,
                            first_page=start_page,
                            last_page=end_page,
                            fmt='jpeg',
                            jpegopt={'quality': 85, 'optimize': True}
                        )
                        for idx, image in enumerate(images):
                            page_num = start_page + idx
                            stats['pages_scanned'] += 1
                            # Resize if needed to cap per-page memory.
                            max_dimension = parameters.CHART_MAX_IMAGE_SIZE
                            if max(image.size) > max_dimension:
                                ratio = max_dimension / max(image.size)
                                new_size = tuple(int(dim * ratio) for dim in image.size)
                                image = image.resize(new_size, Image.Resampling.LANCZOS)
                            page_image_tuples.append((page_num, image))
                        del images
                    except Exception as e:
                        # A bad render window skips those pages but continues the scan.
                        logger.warning(f"Failed to process pages {start_page}-{end_page}: {e}")
                        continue

                detected_charts = []
                if use_local and parameters.CHART_SKIP_GEMINI_DETECTION and page_image_tuples:
                    logger.info("Parallel local chart detection using ProcessPoolExecutor...")
                    # Limit parallelism to avoid memory errors
                    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
                        results = list(executor.map(detect_chart_on_page, page_image_tuples))
                    for page_num, image, detection_result in results:
                        if not detection_result['has_chart']:
                            logger.debug(f"Page {page_num}: No chart detected (skipping)")
                            stats['api_calls_saved'] += 1
                            continue
                        confidence = detection_result['confidence']
                        if confidence < parameters.CHART_MIN_CONFIDENCE:
                            logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
                            stats['api_calls_saved'] += 1
                            continue
                        logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
                        stats['charts_detected_local'] += 1
                        # Persist the page render so Phase 2 can reopen it lazily.
                        image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
                        image.save(image_path, 'JPEG', quality=90)
                        detected_charts.append((page_num, image_path, detection_result))
                        # Release memory
                        del image
                        gc.collect()
                else:
                    # Fallback: sequential detection
                    for page_num, image in page_image_tuples:
                        if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
                            detection_result = LocalChartDetector.detect_charts(image)
                            if not detection_result['has_chart']:
                                logger.debug(f"Page {page_num}: No chart detected (skipping)")
                                stats['api_calls_saved'] += 1
                                continue
                            confidence = detection_result['confidence']
                            if confidence < parameters.CHART_MIN_CONFIDENCE:
                                logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
                                stats['api_calls_saved'] += 1
                                continue
                            logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
                            stats['charts_detected_local'] += 1
                            image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
                            image.save(image_path, 'JPEG', quality=90)
                            detected_charts.append((page_num, image_path, detection_result))

                logger.info(f"Phase 1 complete: {len(detected_charts)} charts detected and cached")

                # === PHASE 2: PARALLEL GEMINI BATCH ANALYSIS (I/O-BOUND) ===
                if not detected_charts or not self.gemini_client:
                    return []

                logger.info(f"Phase 2: Batch analyzing {len(detected_charts)} charts...")
                chart_documents = []

                if parameters.CHART_ENABLE_BATCH_ANALYSIS and len(detected_charts) > 1:
                    # Batch processing with parallel Gemini API calls
                    gemini_batch_size = parameters.CHART_GEMINI_BATCH_SIZE
                    batches = [detected_charts[i:i + gemini_batch_size] for i in range(0, len(detected_charts), gemini_batch_size)]

                    # Prepare batch tuples with batch_num and total_batches
                    batch_tuples = [
                        (batch, idx + 1, len(batches), self.gemini_client, file_path, parameters, stats)
                        for idx, batch in enumerate(batches)
                    ]
                    # Pre-sized slot list keeps results in original batch order
                    # even though futures complete out of order.
                    results = [None] * len(batches)
                    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                        future_to_idx = {executor.submit(analyze_batch, batch_tuple): idx for idx, batch_tuple in enumerate(batch_tuples)}
                        for future in concurrent.futures.as_completed(future_to_idx):
                            idx = future_to_idx[future]
                            try:
                                batch_idx, batch_docs = future.result()
                                results[batch_idx] = batch_docs
                            except Exception as exc:
                                logger.error(f"Batch {idx} generated an exception: {exc}")
                    # Flatten results and filter out None
                    chart_index = 0
                    for batch_docs in results:
                        if batch_docs:
                            for doc in batch_docs:
                                doc.metadata["chunk_id"] = f"{file_hash}_{doc.metadata.get('page', 0)}_{chart_index}"
                                chart_documents.append(doc)
                                chart_index += 1
                else:
                    # Sequential processing (batch disabled or single chart)
                    for chart_index, (page_num, image_path, detection_result) in enumerate(detected_charts):
                        try:
                            img = Image.open(image_path)
                            extraction_prompt = """Analyze this chart/graph in comprehensive detail:
**Chart Type**: [type]
**Title**: [title]
**Axes**: [X and Y labels/units]
**Data Points**: [extract all visible data]
**Legend**: [series/categories]
**Trends**: [key patterns and insights]
**Key Values**: [max, min, significant]
**Context**: [annotations or notes]
"""
                            chart_response = self.gemini_client.models.generate_content(
                                model=parameters.CHART_VISION_MODEL,
                                contents=[extraction_prompt, img],
                                config=types.GenerateContentConfig(
                                    max_output_tokens=parameters.CHART_MAX_TOKENS
                                )
                            )
                            chart_types_str = ", ".join(detection_result['chart_types']) or "Unknown"
                            chart_doc = Document(
                                page_content=f"""### \U0001F4CA Chart Analysis (Page {page_num})\n\n**Detection Method**: Hybrid (Local OpenCV + Gemini Sequential)\n**Local Confidence**: {detection_result['confidence']:.0%}\n**Detected Types**: {chart_types_str}\n\n---\n\n{chart_response.text}\n""",
                                metadata={
                                    "source": file_path,
                                    "page": page_num,
                                    "type": "chart",
                                    "extraction_method": "hybrid_sequential",
                                    "chunk_id": f"{file_hash}_{page_num}_{chart_index}"
                                }
                            )
                            chart_documents.append(chart_doc)
                            stats['charts_analyzed_gemini'] += 1
                            img.close()
                            logger.info(f"✅ Analyzed chart on page {page_num}")
                        except Exception as e:
                            logger.error(f"Failed to analyze page {page_num}: {e}")

                # Log statistics
                if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
                    # $0.0125 per call — rough per-request cost estimate.
                    cost_saved = stats['api_calls_saved'] * 0.0125
                    actual_cost = stats['batch_api_calls'] * 0.0125 if stats['batch_api_calls'] > 0 else stats['charts_analyzed_gemini'] * 0.0125

                    if stats['batch_api_calls'] > 0:
                        efficiency = stats['charts_analyzed_gemini'] / stats['batch_api_calls']
                    else:
                        efficiency = 1.0

                    logger.info(f"""
    📊 Chart Extraction Complete (HYBRID + BATCH MODE):
       Pages scanned: {stats['pages_scanned']}
       Charts detected (local): {stats['charts_detected_local']}
       Charts analyzed (Gemini): {stats['charts_analyzed_gemini']}
       Batch API calls: {stats['batch_api_calls']}
       Charts per API call: {efficiency:.1f}
       API calls saved (detection): {stats['api_calls_saved']}
       Estimated cost savings: ${cost_saved:.3f}
       Actual API cost: ${actual_cost:.3f}
    """)

                # After chart_documents is created (batch or sequential), deduplicate by title:
                chart_documents = deduplicate_charts_by_title(chart_documents)

                return chart_documents

            finally:
                # Only clean up after all analysis is done
                try:
                    import shutil
                    shutil.rmtree(temp_dir)
                    logger.debug(f"Cleaned up temp directory: {temp_dir}")
                except Exception as e:
                    logger.warning(f"Failed to clean temp directory {temp_dir}: {e}")
        except ImportError as e:
            logger.warning(f"Dependencies missing for chart extraction: {e}")
            return []
        except MemoryError as e:
            logger.error(f"Out of memory while processing {file_path}. Try reducing DPI or batch size.")
            return []
        except Exception as e:
            logger.error(f"Chart extraction failed for {file_path}: {e}", exc_info=True)
            return []
|
| 625 |
+
|
| 626 |
+
def _load_pdf_with_pdfplumber(self, file_path: str) -> List[Document]:
|
| 627 |
+
"""
|
| 628 |
+
Load PDF using pdfplumber for text and table extraction.
|
| 629 |
+
|
| 630 |
+
Uses multiple table detection strategies for complex tables.
|
| 631 |
+
"""
|
| 632 |
+
import pdfplumber
|
| 633 |
+
|
| 634 |
+
logger.info(f"[PDFPLUMBER] Processing: {file_path}")
|
| 635 |
+
file_hash = self._generate_hash(file_path.encode())
|
| 636 |
+
|
| 637 |
+
# Strategy 1: Line-based (default) - for tables with visible borders
|
| 638 |
+
default_parameters = {}
|
| 639 |
+
|
| 640 |
+
# Strategy 2: Text-based - for borderless tables with aligned text
|
| 641 |
+
text_parameters = {
|
| 642 |
+
"vertical_strategy": "text",
|
| 643 |
+
"horizontal_strategy": "text",
|
| 644 |
+
"snap_tolerance": 5,
|
| 645 |
+
"join_tolerance": 5,
|
| 646 |
+
"edge_min_length": 3,
|
| 647 |
+
"min_words_vertical": 2,
|
| 648 |
+
"min_words_horizontal": 1,
|
| 649 |
+
"text_tolerance": 3,
|
| 650 |
+
"intersection_tolerance": 5,
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
# Strategy 3: Lines + text hybrid - for complex tables
|
| 654 |
+
hybrid_parameters = {
|
| 655 |
+
"vertical_strategy": "lines_strict",
|
| 656 |
+
"horizontal_strategy": "text",
|
| 657 |
+
"snap_tolerance": 5,
|
| 658 |
+
"join_tolerance": 5,
|
| 659 |
+
"min_words_horizontal": 1,
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
all_content = []
|
| 663 |
+
total_tables = 0
|
| 664 |
+
with pdfplumber.open(file_path) as pdf:
|
| 665 |
+
for page_num, page in enumerate(pdf.pages, 1):
|
| 666 |
+
page_content = [f"## Page {page_num}"]
|
| 667 |
+
page_tables = []
|
| 668 |
+
table_hashes = set() # Track unique tables
|
| 669 |
+
|
| 670 |
+
def add_table_if_unique(table, strategy_name):
|
| 671 |
+
"""Add table if not already found."""
|
| 672 |
+
if not table or len(table) < 2:
|
| 673 |
+
return False
|
| 674 |
+
# Create hash of table content
|
| 675 |
+
table_str = str(table)
|
| 676 |
+
table_hash = hash(table_str)
|
| 677 |
+
if table_hash not in table_hashes:
|
| 678 |
+
table_hashes.add(table_hash)
|
| 679 |
+
page_tables.append((table, strategy_name))
|
| 680 |
+
return True
|
| 681 |
+
return False
|
| 682 |
+
|
| 683 |
+
# --- Robust per-page error handling ---
|
| 684 |
+
try:
|
| 685 |
+
# Strategy 1: Default line-based detection
|
| 686 |
+
try:
|
| 687 |
+
default_tables = page.extract_tables()
|
| 688 |
+
if default_tables:
|
| 689 |
+
for t in default_tables:
|
| 690 |
+
add_table_if_unique(t, "default")
|
| 691 |
+
except Exception as e:
|
| 692 |
+
logger.warning(f"Default strategy failed on page {page_num}: {e}")
|
| 693 |
+
# Strategy 2: Text-based detection for borderless tables
|
| 694 |
+
try:
|
| 695 |
+
text_tables = page.extract_tables(text_parameters)
|
| 696 |
+
if text_tables:
|
| 697 |
+
for t in text_tables:
|
| 698 |
+
add_table_if_unique(t, "text")
|
| 699 |
+
except Exception as e:
|
| 700 |
+
logger.warning(f"Text strategy failed on page {page_num}: {e}")
|
| 701 |
+
# Strategy 3: Hybrid detection
|
| 702 |
+
try:
|
| 703 |
+
hybrid_tables = page.extract_tables(hybrid_parameters)
|
| 704 |
+
if hybrid_tables:
|
| 705 |
+
for t in hybrid_tables:
|
| 706 |
+
add_table_if_unique(t, "hybrid")
|
| 707 |
+
except Exception as e:
|
| 708 |
+
logger.warning(f"Hybrid strategy failed on page {page_num}: {e}")
|
| 709 |
+
# Strategy 4: Use find_tables() for more control
|
| 710 |
+
try:
|
| 711 |
+
found_tables = page.find_tables(text_parameters)
|
| 712 |
+
if found_tables:
|
| 713 |
+
for ft in found_tables:
|
| 714 |
+
t = ft.extract()
|
| 715 |
+
add_table_if_unique(t, "find_tables")
|
| 716 |
+
except Exception as e:
|
| 717 |
+
logger.warning(f"find_tables() failed on page {page_num}: {e}")
|
| 718 |
+
|
| 719 |
+
# Convert tables to markdown
|
| 720 |
+
for table, strategy in page_tables:
|
| 721 |
+
total_tables += 1
|
| 722 |
+
md_table = self._table_to_markdown(table, page_num, total_tables)
|
| 723 |
+
if md_table:
|
| 724 |
+
page_content.append(md_table)
|
| 725 |
+
|
| 726 |
+
# Extract text
|
| 727 |
+
try:
|
| 728 |
+
text = page.extract_text()
|
| 729 |
+
if text:
|
| 730 |
+
page_content.append(text.strip())
|
| 731 |
+
except Exception as e:
|
| 732 |
+
logger.warning(f"Text extraction failed on page {page_num}: {e}")
|
| 733 |
+
|
| 734 |
+
if len(page_content) > 1:
|
| 735 |
+
combined = "\n\n".join(page_content)
|
| 736 |
+
chunk_id = f"{file_hash}_{page_num}_0"
|
| 737 |
+
doc = Document(
|
| 738 |
+
page_content=combined,
|
| 739 |
+
metadata={
|
| 740 |
+
"source": file_path,
|
| 741 |
+
"page": page_num,
|
| 742 |
+
"loader": "pdfplumber",
|
| 743 |
+
"tables_count": total_tables,
|
| 744 |
+
"type": "text",
|
| 745 |
+
"chunk_id": chunk_id
|
| 746 |
+
}
|
| 747 |
+
)
|
| 748 |
+
all_content.append(doc)
|
| 749 |
+
except Exception as e:
|
| 750 |
+
logger.warning(f"Skipping page {page_num} due to error: {e}")
|
| 751 |
+
continue
|
| 752 |
+
|
| 753 |
+
logger.info(f"[PDFPLUMBER] Extracted {len(all_content)} chunks, {total_tables} tables")
|
| 754 |
+
return all_content
|
| 755 |
+
|
| 756 |
+
def _table_to_markdown(self, table: List[List], page_num: int, table_idx: int) -> str:
|
| 757 |
+
"""Convert a table (list of rows) to markdown format."""
|
| 758 |
+
if not table or len(table) < 1:
|
| 759 |
+
return ""
|
| 760 |
+
|
| 761 |
+
# Clean up cells
|
| 762 |
+
cleaned_table = []
|
| 763 |
+
for row in table:
|
| 764 |
+
if row:
|
| 765 |
+
cleaned_row = []
|
| 766 |
+
for cell in row:
|
| 767 |
+
if cell:
|
| 768 |
+
cell_text = str(cell).replace('\n', ' ').replace('\r', ' ').replace('|', '\\|').strip()
|
| 769 |
+
cleaned_row.append(cell_text)
|
| 770 |
+
else:
|
| 771 |
+
cleaned_row.append("")
|
| 772 |
+
if any(cleaned_row):
|
| 773 |
+
cleaned_table.append(cleaned_row)
|
| 774 |
+
|
| 775 |
+
if len(cleaned_table) < 1:
|
| 776 |
+
return ""
|
| 777 |
+
|
| 778 |
+
# Determine max columns and pad rows
|
| 779 |
+
max_cols = max(len(row) for row in cleaned_table)
|
| 780 |
+
for row in cleaned_table:
|
| 781 |
+
while len(row) < max_cols:
|
| 782 |
+
row.append("")
|
| 783 |
+
|
| 784 |
+
# Build markdown table
|
| 785 |
+
md_lines = [f"### Table {table_idx} (Page {page_num})"]
|
| 786 |
+
md_lines.append("| " + " | ".join(cleaned_table[0]) + " |")
|
| 787 |
+
md_lines.append("| " + " | ".join(["---"] * max_cols) + " |")
|
| 788 |
+
|
| 789 |
+
for row in cleaned_table[1:]:
|
| 790 |
+
md_lines.append("| " + " | ".join(row) + " |")
|
| 791 |
+
|
| 792 |
+
return "\n".join(md_lines)
|
| 793 |
+
|
| 794 |
+
def process(self, files: List, progress_callback=None) -> List[Document]:
|
| 795 |
+
"""
|
| 796 |
+
Process multiple files with caching and deduplication.
|
| 797 |
+
"""
|
| 798 |
+
self.validate_files(files)
|
| 799 |
+
all_chunks = []
|
| 800 |
+
seen_hashes = set()
|
| 801 |
+
logger.info(f"Processing {len(files)} file(s)...")
|
| 802 |
+
for file in files:
|
| 803 |
+
try:
|
| 804 |
+
with open(file.name, 'rb') as f:
|
| 805 |
+
file_content = f.read()
|
| 806 |
+
file_hash = self._generate_hash(file_content)
|
| 807 |
+
cache_path = self.cache_dir / f"{file_hash}.pkl"
|
| 808 |
+
if self._is_cache_valid(cache_path):
|
| 809 |
+
chunks = self._load_from_cache(cache_path)
|
| 810 |
+
if chunks:
|
| 811 |
+
logger.info(f"Using cached chunks for {file.name}")
|
| 812 |
+
else:
|
| 813 |
+
chunks = self._process_file(file, progress_callback=progress_callback)
|
| 814 |
+
self._save_to_cache(chunks, cache_path)
|
| 815 |
+
else:
|
| 816 |
+
logger.info(f"Processing and caching: {file.name}")
|
| 817 |
+
chunks = self._process_file(file, progress_callback=progress_callback)
|
| 818 |
+
self._save_to_cache(chunks, cache_path)
|
| 819 |
+
for chunk in chunks:
|
| 820 |
+
chunk_hash = self._generate_hash(chunk.page_content.encode())
|
| 821 |
+
if chunk_hash not in seen_hashes:
|
| 822 |
+
seen_hashes.add(chunk_hash)
|
| 823 |
+
all_chunks.append(chunk)
|
| 824 |
+
except Exception as e:
|
| 825 |
+
logger.error(f"Failed to process {file.name}: {e}", exc_info=True)
|
| 826 |
+
continue
|
| 827 |
+
logger.info(f"Processing complete: {len(all_chunks)} unique chunks from {len(files)} file(s)")
|
| 828 |
+
return all_chunks
|
| 829 |
+
|
| 830 |
+
def run_pdfplumber(file_name):
|
| 831 |
+
from content_analyzer.document_parser import DocumentProcessor
|
| 832 |
+
processor = DocumentProcessor()
|
| 833 |
+
return processor._load_pdf_with_pdfplumber(file_name)
|
| 834 |
+
|
| 835 |
+
def run_charts(file_name, enable_chart_extraction, gemini_client):
|
| 836 |
+
from content_analyzer.document_parser import DocumentProcessor
|
| 837 |
+
processor = DocumentProcessor()
|
| 838 |
+
processor.gemini_client = gemini_client
|
| 839 |
+
if enable_chart_extraction and gemini_client:
|
| 840 |
+
return processor._extract_charts_from_pdf(file_name)
|
| 841 |
+
return []
|
| 842 |
+
|
content_analyzer/visual_detector.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local Chart Detection Module - NO API CALLS
|
| 3 |
+
|
| 4 |
+
Uses OpenCV and image analysis for chart detection without any LLM cost.
|
| 5 |
+
This module provides FREE chart detection as an alternative to expensive Gemini Vision API calls.
|
| 6 |
+
|
| 7 |
+
Author: SmartDoc AI
|
| 8 |
+
License: MIT
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
class LocalChartDetector:
|
| 17 |
+
"""
|
| 18 |
+
Detects charts in images using OpenCV - completely free, no API calls.
|
| 19 |
+
Detection Features:
|
| 20 |
+
- Edge detection (Canny)
|
| 21 |
+
- Line detection (HoughLinesP)
|
| 22 |
+
- Circle detection (HoughCircles)
|
| 23 |
+
- Contour analysis for shapes
|
| 24 |
+
- Axis pattern recognition
|
| 25 |
+
Detectable Chart Types:
|
| 26 |
+
- Line charts (multiple organized lines)
|
| 27 |
+
- Bar charts (rectangular shapes)
|
| 28 |
+
- Pie charts (circular patterns)
|
| 29 |
+
- Scatter plots (lines + circles)
|
| 30 |
+
- Charts with axes (H/V line patterns)
|
| 31 |
+
- Bubble charts (circles with variable size)
|
| 32 |
+
- Zone diagrams (areas with color coding)
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
@staticmethod
|
| 36 |
+
def detect_charts(image) -> Dict[str, Any]:
|
| 37 |
+
"""
|
| 38 |
+
Detects complex charts and visualizations only - rejects tables, maps, and simple graphics.
|
| 39 |
+
Returns a dictionary with detection results and features.
|
| 40 |
+
"""
|
| 41 |
+
import time
|
| 42 |
+
start_time = time.time()
|
| 43 |
+
try:
|
| 44 |
+
import cv2
|
| 45 |
+
import numpy as np
|
| 46 |
+
from PIL import Image as PILImage
|
| 47 |
+
|
| 48 |
+
# --- Image Preparation ---
|
| 49 |
+
# Convert PIL image to OpenCV format if needed
|
| 50 |
+
if isinstance(image, PILImage.Image):
|
| 51 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 52 |
+
else:
|
| 53 |
+
image_cv = image
|
| 54 |
+
height, width = image_cv.shape[:2]
|
| 55 |
+
gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
|
| 56 |
+
|
| 57 |
+
# --- Edge Detection ---
|
| 58 |
+
edges = cv2.Canny(gray, 50, 150)
|
| 59 |
+
|
| 60 |
+
# --- Edge Density Calculation ---
|
| 61 |
+
w_half = width // 2
|
| 62 |
+
left_region = edges[:, :w_half]
|
| 63 |
+
right_region = edges[:, w_half:]
|
| 64 |
+
left_edge_density = np.sum(left_region > 0) / (left_region.shape[0] * left_region.shape[1])
|
| 65 |
+
right_edge_density = np.sum(right_region > 0) / (right_region.shape[0] * right_region.shape[1])
|
| 66 |
+
overall_edge_density = np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])
|
| 67 |
+
has_text_region = (
|
| 68 |
+
(left_edge_density > 0.08 and right_edge_density > 0.08) or
|
| 69 |
+
overall_edge_density > 0.15
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# --- Line Detection ---
|
| 73 |
+
lines = cv2.HoughLinesP(
|
| 74 |
+
edges,
|
| 75 |
+
rho=1,
|
| 76 |
+
theta=np.pi/180,
|
| 77 |
+
threshold=100,
|
| 78 |
+
minLineLength=100,
|
| 79 |
+
maxLineGap=10
|
| 80 |
+
)
|
| 81 |
+
line_count = len(lines) if lines is not None else 0
|
| 82 |
+
diagonal_lines = 0
|
| 83 |
+
line_angles = []
|
| 84 |
+
if lines is not None:
|
| 85 |
+
for line in lines:
|
| 86 |
+
x1, y1, x2, y2 = line[0]
|
| 87 |
+
angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
|
| 88 |
+
if 10 < angle < 80 or 100 < angle < 170:
|
| 89 |
+
diagonal_lines += 1
|
| 90 |
+
line_angles.append(angle)
|
| 91 |
+
|
| 92 |
+
# --- Circle Detection (Optimized) ---
|
| 93 |
+
run_circles = diagonal_lines >= 1 or line_count >= 6 or overall_edge_density > 0.08
|
| 94 |
+
circle_count = 0
|
| 95 |
+
circles = None
|
| 96 |
+
if run_circles:
|
| 97 |
+
scale = 0.5 if max(height, width) > 800 else 1.0
|
| 98 |
+
small_gray = cv2.resize(gray, (int(width*scale), int(height*scale)), interpolation=cv2.INTER_AREA) if scale < 1.0 else gray
|
| 99 |
+
circles = cv2.HoughCircles(
|
| 100 |
+
small_gray,
|
| 101 |
+
cv2.HOUGH_GRADIENT,
|
| 102 |
+
dp=2.5,
|
| 103 |
+
minDist=60,
|
| 104 |
+
param1=50,
|
| 105 |
+
param2=55,
|
| 106 |
+
minRadius=18,
|
| 107 |
+
maxRadius=100
|
| 108 |
+
)
|
| 109 |
+
if circles is not None:
|
| 110 |
+
circle_count = circles.shape[2]
|
| 111 |
+
|
| 112 |
+
# --- Color Diversity Analysis ---
|
| 113 |
+
hsv = cv2.cvtColor(image_cv, cv2.COLOR_BGR2HSV)
|
| 114 |
+
hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
|
| 115 |
+
color_peaks = np.sum(hist > np.mean(hist) * 2)
|
| 116 |
+
|
| 117 |
+
# --- Contour Detection ---
|
| 118 |
+
contours, _ = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
| 119 |
+
significant_contours = 0
|
| 120 |
+
rectangle_contours = 0
|
| 121 |
+
similar_rectangles = []
|
| 122 |
+
large_contours = 0
|
| 123 |
+
small_scattered_contours = 0
|
| 124 |
+
for contour in contours:
|
| 125 |
+
area = cv2.contourArea(contour)
|
| 126 |
+
if area < 500:
|
| 127 |
+
small_scattered_contours += 1
|
| 128 |
+
elif 1500 < area < 40000:
|
| 129 |
+
significant_contours += 1
|
| 130 |
+
peri = cv2.arcLength(contour, True)
|
| 131 |
+
approx = cv2.approxPolyDP(contour, 0.04 * peri, True)
|
| 132 |
+
if len(approx) == 4:
|
| 133 |
+
rectangle_contours += 1
|
| 134 |
+
x, y, w, h = cv2.boundingRect(contour)
|
| 135 |
+
similar_rectangles.append((w, h, area))
|
| 136 |
+
elif 40000 < area < 500000:
|
| 137 |
+
large_contours += 1
|
| 138 |
+
|
| 139 |
+
# --- Bar Chart Pattern Detection ---
|
| 140 |
+
bar_pattern = False
|
| 141 |
+
if len(similar_rectangles) >= 6:
|
| 142 |
+
widths = [r[0] for r in similar_rectangles]
|
| 143 |
+
heights = [r[1] for r in similar_rectangles]
|
| 144 |
+
width_std = np.std(widths)
|
| 145 |
+
height_std = np.std(heights)
|
| 146 |
+
avg_width = np.mean(widths)
|
| 147 |
+
avg_height = np.mean(heights)
|
| 148 |
+
if (width_std < avg_width * 0.3 or height_std < avg_height * 0.3):
|
| 149 |
+
bar_pattern = True
|
| 150 |
+
|
| 151 |
+
# --- Line Classification ---
|
| 152 |
+
horizontal_lines = 0
|
| 153 |
+
vertical_lines = 0
|
| 154 |
+
diagonal_lines = 0
|
| 155 |
+
line_angles = []
|
| 156 |
+
very_short_lines = 0
|
| 157 |
+
if lines is not None:
|
| 158 |
+
for line in lines:
|
| 159 |
+
x1, y1, x2, y2 = line[0]
|
| 160 |
+
length = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
|
| 161 |
+
if length < 50:
|
| 162 |
+
very_short_lines += 1
|
| 163 |
+
continue
|
| 164 |
+
if length < 80:
|
| 165 |
+
continue
|
| 166 |
+
angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
|
| 167 |
+
line_angles.append(angle)
|
| 168 |
+
if angle < 10 or angle > 170:
|
| 169 |
+
horizontal_lines += 1
|
| 170 |
+
elif 80 < angle < 100:
|
| 171 |
+
vertical_lines += 1
|
| 172 |
+
else:
|
| 173 |
+
diagonal_lines += 1
|
| 174 |
+
angle_variance = np.var(line_angles) if len(line_angles) > 2 else 0
|
| 175 |
+
|
| 176 |
+
# --- Debug Logging ---
|
| 177 |
+
logger.debug(f"Chart detection features: lines={line_count}, diagonal_lines={diagonal_lines}, circles={circle_count}, horizontal_lines={horizontal_lines}, vertical_lines={vertical_lines}, color_peaks={color_peaks}, angle_variance={angle_variance}")
|
| 178 |
+
|
| 179 |
+
# --- Chart Heuristics and Classification ---
|
| 180 |
+
chart_types = []
|
| 181 |
+
confidence = 0.0
|
| 182 |
+
description = ""
|
| 183 |
+
rejection_reason = ""
|
| 184 |
+
|
| 185 |
+
# Negative checks (text slides, decorative backgrounds, tables)
|
| 186 |
+
if has_text_region and circle_count < 2 and diagonal_lines < 2 and not bar_pattern:
|
| 187 |
+
if small_scattered_contours > 100 or very_short_lines > 50:
|
| 188 |
+
rejection_reason = f"Text slide with decorative background (overall density: {overall_edge_density:.2%})"
|
| 189 |
+
logger.debug(f"Rejected: {rejection_reason}")
|
| 190 |
+
return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
|
| 191 |
+
if very_short_lines > 50 and circle_count < 2 and diagonal_lines < 3 and line_count < 10:
|
| 192 |
+
rejection_reason = f"Decorative network background ({very_short_lines} tiny lines, no data elements)"
|
| 193 |
+
logger.debug(f"Rejected: {rejection_reason}")
|
| 194 |
+
return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
|
| 195 |
+
if horizontal_lines > 12 and vertical_lines > 12 and circle_count == 0 and diagonal_lines < 2:
|
| 196 |
+
grid_lines = horizontal_lines + vertical_lines
|
| 197 |
+
total_lines = line_count
|
| 198 |
+
grid_ratio = grid_lines / max(total_lines, 1)
|
| 199 |
+
if grid_ratio > 0.75:
|
| 200 |
+
rejection_reason = f"Simple table pattern (H:{horizontal_lines}, V:{vertical_lines})"
|
| 201 |
+
logger.debug(f"Rejected: {rejection_reason}")
|
| 202 |
+
return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
|
| 203 |
+
|
| 204 |
+
# Positive chart heuristics (bubble, scatter, line, pie, bar, complex)
|
| 205 |
+
# RELAXED: Detect as line chart if 2+ diagonal lines and angle variance > 40, or 1+ diagonal line and 1+ axis
|
| 206 |
+
if (
|
| 207 |
+
(diagonal_lines >= 2 and angle_variance > 40) or
|
| 208 |
+
(diagonal_lines >= 1 and (horizontal_lines >= 1 or vertical_lines >= 1))
|
| 209 |
+
):
|
| 210 |
+
chart_types.append("line_chart")
|
| 211 |
+
confidence = max(confidence, min(0.88, 0.6 + (diagonal_lines / 40)))
|
| 212 |
+
if (horizontal_lines >= 1 or vertical_lines >= 1):
|
| 213 |
+
confidence = min(0.95, confidence + 0.08)
|
| 214 |
+
if not description:
|
| 215 |
+
description = f"Line chart: {diagonal_lines} diagonal lines, axes: {horizontal_lines+vertical_lines}, variance: {angle_variance:.0f}"
|
| 216 |
+
if circle_count >= 5:
|
| 217 |
+
chart_types.append("bubble_chart")
|
| 218 |
+
confidence = min(0.92, 0.70 + (min(circle_count, 20) * 0.01))
|
| 219 |
+
description = f"Bubble chart: {circle_count} circles"
|
| 220 |
+
if color_peaks > 5:
|
| 221 |
+
confidence = min(0.95, confidence + 0.1)
|
| 222 |
+
description += f", {int(color_peaks)} color zones"
|
| 223 |
+
if large_contours > 2:
|
| 224 |
+
confidence = min(0.97, confidence + 0.05)
|
| 225 |
+
chart_types.append("zone_diagram")
|
| 226 |
+
description += f", {large_contours} colored regions"
|
| 227 |
+
elif circle_count >= 3 and diagonal_lines > 2:
|
| 228 |
+
chart_types.append("scatter_plot")
|
| 229 |
+
confidence = max(confidence, 0.75)
|
| 230 |
+
description = f"Scatter plot: {circle_count} data points"
|
| 231 |
+
if circle_count > 0 and circle_count < 5:
|
| 232 |
+
if "bubble_chart" not in chart_types:
|
| 233 |
+
chart_types.append("pie_chart")
|
| 234 |
+
confidence = max(confidence, 0.80)
|
| 235 |
+
if not description:
|
| 236 |
+
description = f"Pie chart: {circle_count} circular pattern(s)"
|
| 237 |
+
if bar_pattern and rectangle_contours >= 6:
|
| 238 |
+
chart_types.append("bar_chart")
|
| 239 |
+
confidence = max(confidence, 0.75 + (min(rectangle_contours, 12) / 40))
|
| 240 |
+
if not description:
|
| 241 |
+
description = f"Bar chart: {rectangle_contours} bars"
|
| 242 |
+
if circle_count >= 3 and large_contours >= 2 and color_peaks > 5:
|
| 243 |
+
chart_types.append("complex_visualization")
|
| 244 |
+
confidence = max(confidence, 0.85)
|
| 245 |
+
if not description:
|
| 246 |
+
description = "Complex visualization with zones and data points"
|
| 247 |
+
has_moderate_axes = (1 <= horizontal_lines <= 6 or 1 <= vertical_lines <= 6)
|
| 248 |
+
has_real_data = (circle_count >= 3 or diagonal_lines >= 2 or bar_pattern)
|
| 249 |
+
if has_moderate_axes and has_real_data and confidence > 0.3:
|
| 250 |
+
confidence = min(0.90, confidence + 0.10)
|
| 251 |
+
if not description:
|
| 252 |
+
description = f"Chart with axes and data elements"
|
| 253 |
+
|
| 254 |
+
# Final chart determination
|
| 255 |
+
strong_indicator = (
|
| 256 |
+
(diagonal_lines >= 2 and angle_variance > 40) or
|
| 257 |
+
(diagonal_lines >= 1 and (horizontal_lines >= 1 or vertical_lines >= 1)) or
|
| 258 |
+
circle_count >= 5 or
|
| 259 |
+
(circle_count >= 3 and large_contours >= 2) or
|
| 260 |
+
bar_pattern or
|
| 261 |
+
(circle_count >= 3 and color_peaks > 5)
|
| 262 |
+
)
|
| 263 |
+
has_chart = (
|
| 264 |
+
len(chart_types) > 0 and
|
| 265 |
+
confidence > 0.4 and
|
| 266 |
+
strong_indicator
|
| 267 |
+
)
|
| 268 |
+
total_time = time.time() - start_time
|
| 269 |
+
if has_chart:
|
| 270 |
+
logger.info(f"?? OpenCV detection: {total_time*1000:.0f}ms (lines:{line_count}, diagonal_lines:{diagonal_lines}, circles:{circle_count}, axes:{horizontal_lines+vertical_lines}, angle_variance:{angle_variance})")
|
| 271 |
+
else:
|
| 272 |
+
logger.debug(f"?? OpenCV detection: {total_time*1000:.0f}ms (rejected)")
|
| 273 |
+
return {
|
| 274 |
+
'has_chart': has_chart,
|
| 275 |
+
'confidence': float(confidence),
|
| 276 |
+
'chart_types': list(set(chart_types)),
|
| 277 |
+
'description': description or "Potential chart detected",
|
| 278 |
+
'features': {
|
| 279 |
+
'lines': line_count,
|
| 280 |
+
'diagonal_lines': diagonal_lines,
|
| 281 |
+
'circles': circle_count,
|
| 282 |
+
'contours': significant_contours,
|
| 283 |
+
'rectangles': rectangle_contours,
|
| 284 |
+
'horizontal_lines': horizontal_lines,
|
| 285 |
+
'vertical_lines': vertical_lines,
|
| 286 |
+
'angle_variance': float(angle_variance),
|
| 287 |
+
'bar_pattern': bar_pattern,
|
| 288 |
+
'large_contours': large_contours,
|
| 289 |
+
'color_peaks': int(color_peaks),
|
| 290 |
+
'text_region': has_text_region,
|
| 291 |
+
'very_short_lines': very_short_lines,
|
| 292 |
+
'overall_edge_density': float(overall_edge_density),
|
| 293 |
+
'detection_time_ms': float(total_time * 1000)
|
| 294 |
+
}
|
| 295 |
+
}
|
| 296 |
+
except ImportError as e:
|
| 297 |
+
logger.warning(f"OpenCV not installed: {e}")
|
| 298 |
+
logger.info("Install with: pip install opencv-python")
|
| 299 |
+
return {
|
| 300 |
+
'has_chart': False,
|
| 301 |
+
'confidence': 0.0,
|
| 302 |
+
'chart_types': [],
|
| 303 |
+
'description': 'OpenCV required for local detection',
|
| 304 |
+
'features': {},
|
| 305 |
+
'error': 'opencv_not_installed'
|
| 306 |
+
}
|
| 307 |
+
except Exception as e:
|
| 308 |
+
logger.error(f"Chart detection error: {e}")
|
| 309 |
+
return {
|
| 310 |
+
'has_chart': False,
|
| 311 |
+
'confidence': 0.0,
|
| 312 |
+
'chart_types': [],
|
| 313 |
+
'description': f'Detection error: {str(e)}',
|
| 314 |
+
'features': {},
|
| 315 |
+
'error': str(e)
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
def _chart_result(has_chart, confidence, chart_types, description, line_count, circle_count, overall_edge_density):
|
| 319 |
+
"""Helper to return a standard chart detection result dict."""
|
| 320 |
+
return {
|
| 321 |
+
'has_chart': has_chart,
|
| 322 |
+
'confidence': confidence,
|
| 323 |
+
'chart_types': chart_types,
|
| 324 |
+
'description': description,
|
| 325 |
+
'features': {
|
| 326 |
+
'lines': line_count,
|
| 327 |
+
'circles': circle_count,
|
| 328 |
+
'overall_edge_density': float(overall_edge_density)
|
| 329 |
+
}
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
# Detection configuration thresholds (BALANCED - detect real charts, reject pure text)
|
| 333 |
+
DETECTION_CONFIG = {
|
| 334 |
+
'min_circles_bubble_chart': 5,
|
| 335 |
+
'min_circles_scatter': 3,
|
| 336 |
+
'min_diagonal_lines': 5, # Lowered from 8 for line charts
|
| 337 |
+
'min_angle_variance': 150, # Lowered from 200 for line charts
|
| 338 |
+
'min_rectangle_contours': 6,
|
| 339 |
+
'min_confidence_threshold': 0.4, # Lowered from 0.5
|
| 340 |
+
'max_grid_ratio': 0.75,
|
| 341 |
+
'max_text_edge_density_both': 0.08, # Both sides text
|
| 342 |
+
'max_text_edge_density_overall': 0.15, # Entire page text
|
| 343 |
+
'min_very_short_lines_mesh': 50,
|
| 344 |
+
'axis_confidence_bonus': 0.10,
|
| 345 |
+
'min_line_length': 80,
|
| 346 |
+
'contour_area_min': 1500,
|
| 347 |
+
'contour_area_max': 40000,
|
| 348 |
+
'large_contour_min': 40000,
|
| 349 |
+
'large_contour_max': 500000,
|
| 350 |
+
'circle_radius_min': 15,
|
| 351 |
+
'circle_radius_max': 300,
|
| 352 |
+
'min_bar_chart_bars': 6,
|
| 353 |
+
'min_color_peaks': 5
|
| 354 |
+
}
|
core/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .logging import logger
|
| 2 |
+
|
| 3 |
+
__all__ = ["logger"]
|
core/diagnostics.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Health check utilities for DocChat.
|
| 3 |
+
|
| 4 |
+
This module provides diagnostics check functions that can be used
|
| 5 |
+
to verify the application is running correctly.
|
| 6 |
+
"""
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Dict, Any
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def check_diagnostics() -> Dict[str, Any]:
|
| 15 |
+
"""
|
| 16 |
+
Perform a comprehensive diagnostics check of the application.
|
| 17 |
+
|
| 18 |
+
Returns:
|
| 19 |
+
Dict with diagnostics status and component information
|
| 20 |
+
"""
|
| 21 |
+
diagnostics_status = {
|
| 22 |
+
"status": "diagnosticsy",
|
| 23 |
+
"timestamp": datetime.utcnow().isoformat(),
|
| 24 |
+
"components": {}
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
# Check parameters
|
| 28 |
+
try:
|
| 29 |
+
from configuration.parameters import parameters
|
| 30 |
+
diagnostics_status["components"]["parameters"] = {
|
| 31 |
+
"status": "ok",
|
| 32 |
+
"chroma_db_path": parameters.CHROMA_DB_PATH,
|
| 33 |
+
"log_level": parameters.LOG_LEVEL
|
| 34 |
+
}
|
| 35 |
+
except Exception as e:
|
| 36 |
+
diagnostics_status["components"]["parameters"] = {
|
| 37 |
+
"status": "error",
|
| 38 |
+
"error": str(e)
|
| 39 |
+
}
|
| 40 |
+
diagnostics_status["status"] = "undiagnosticsy"
|
| 41 |
+
|
| 42 |
+
# Check ChromaDB directory
|
| 43 |
+
try:
|
| 44 |
+
from pathlib import Path
|
| 45 |
+
chroma_path = Path(parameters.CHROMA_DB_PATH)
|
| 46 |
+
diagnostics_status["components"]["chroma_db"] = {
|
| 47 |
+
"status": "ok",
|
| 48 |
+
"path_exists": chroma_path.exists(),
|
| 49 |
+
"is_writable": chroma_path.exists() and chroma_path.is_dir()
|
| 50 |
+
}
|
| 51 |
+
except Exception as e:
|
| 52 |
+
diagnostics_status["components"]["chroma_db"] = {
|
| 53 |
+
"status": "error",
|
| 54 |
+
"error": str(e)
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Check cache directory
|
| 58 |
+
try:
|
| 59 |
+
cache_path = Path(parameters.CACHE_DIR)
|
| 60 |
+
diagnostics_status["components"]["cache"] = {
|
| 61 |
+
"status": "ok",
|
| 62 |
+
"path_exists": cache_path.exists(),
|
| 63 |
+
"is_writable": cache_path.exists() and cache_path.is_dir()
|
| 64 |
+
}
|
| 65 |
+
except Exception as e:
|
| 66 |
+
diagnostics_status["components"]["cache"] = {
|
| 67 |
+
"status": "error",
|
| 68 |
+
"error": str(e)
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
# Check if required packages are importable
|
| 72 |
+
required_packages = [
|
| 73 |
+
"langchain",
|
| 74 |
+
"langchain_google_genai",
|
| 75 |
+
"chromadb",
|
| 76 |
+
"gradio"
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
packages_status = {}
|
| 80 |
+
for package in required_packages:
|
| 81 |
+
try:
|
| 82 |
+
__import__(package)
|
| 83 |
+
packages_status[package] = "ok"
|
| 84 |
+
except ImportError as e:
|
| 85 |
+
packages_status[package] = f"missing: {e}"
|
| 86 |
+
diagnostics_status["status"] = "degraded"
|
| 87 |
+
|
| 88 |
+
diagnostics_status["components"]["packages"] = packages_status
|
| 89 |
+
|
| 90 |
+
return diagnostics_status
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def check_api_key() -> Dict[str, Any]:
|
| 94 |
+
"""
|
| 95 |
+
Check if the Google API key is configured and valid format.
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
Dict with API key status (does not expose the key)
|
| 99 |
+
"""
|
| 100 |
+
try:
|
| 101 |
+
from configuration.parameters import parameters
|
| 102 |
+
api_key = parameters.GOOGLE_API_KEY
|
| 103 |
+
|
| 104 |
+
if not api_key:
|
| 105 |
+
return {"status": "missing", "message": "GOOGLE_API_KEY not set"}
|
| 106 |
+
|
| 107 |
+
if len(api_key) < 20:
|
| 108 |
+
return {"status": "invalid", "message": "API key appears too short"}
|
| 109 |
+
|
| 110 |
+
# Mask the key for logging (show first 4 and last 4 chars)
|
| 111 |
+
masked = f"{api_key[:4]}...{api_key[-4:]}"
|
| 112 |
+
|
| 113 |
+
return {
|
| 114 |
+
"status": "configured",
|
| 115 |
+
"masked_key": masked,
|
| 116 |
+
"length": len(api_key)
|
| 117 |
+
}
|
| 118 |
+
except Exception as e:
|
| 119 |
+
return {"status": "error", "message": str(e)}
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
if __name__ == "__main__":
|
| 123 |
+
# Run diagnostics check when executed directly
|
| 124 |
+
import json
|
| 125 |
+
print(json.dumps(check_diagnostics(), indent=2))
|
core/lifecycle.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Signal handling and graceful lifecycle utilities.
|
| 3 |
+
|
| 4 |
+
This module provides graceful lifecycle handling for the DocChat application,
|
| 5 |
+
ensuring resources are properly cleaned up when the application is terminated.
|
| 6 |
+
"""
|
| 7 |
+
import signal
|
| 8 |
+
import sys
|
| 9 |
+
import logging
|
| 10 |
+
import atexit
|
| 11 |
+
from typing import Callable, List, Optional
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ShutdownHandler:
    """
    Manages graceful lifecycle of the application.

    Registers cleanup callbacks that are executed when the application
    receives a termination signal (SIGINT, SIGTERM) or exits normally.
    Implemented as a singleton so signal/atexit handlers are installed once.
    """

    _instance: Optional['ShutdownHandler'] = None

    def __new__(cls) -> 'ShutdownHandler':
        """Singleton pattern to ensure only one handler exists."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self) -> None:
        """Initialize the lifecycle handler (no-op on repeated construction)."""
        if self._initialized:
            return

        # Each entry is a (callback, name) pair -- see register_cleanup().
        self._cleanup_callbacks: List[tuple] = []
        self._lifecycle_in_progress: bool = False
        self._initialized = True

        # Register signal handlers. signal.signal() raises ValueError when
        # called outside the main thread, so guard it; cleanup then relies
        # on the atexit hook alone.
        try:
            signal.signal(signal.SIGINT, self._signal_handler)
            signal.signal(signal.SIGTERM, self._signal_handler)
        except ValueError:
            logger.warning("[SHUTDOWN] Not in main thread; signal handlers not installed")

        # Register atexit handler for normal exits
        atexit.register(self._atexit_handler)

        logger.info("[SHUTDOWN] ShutdownHandler initialized")

    def register_cleanup(self, callback: Callable, name: str = "") -> None:
        """
        Register a cleanup callback to be called on lifecycle.

        Args:
            callback: Function to call during lifecycle
            name: Optional name for logging purposes
        """
        self._cleanup_callbacks.append((callback, name))
        logger.debug(f"[SHUTDOWN] Registered cleanup callback: {name or callback.__name__}")

    def _signal_handler(self, signum: int, frame) -> None:
        """
        Handle termination signals.

        Args:
            signum: Signal number
            frame: Current stack frame (unused, required by the signal API)
        """
        signal_name = signal.Signals(signum).name
        logger.info(f"[SHUTDOWN] Received {signal_name}, initiating graceful lifecycle...")

        self._execute_cleanup()
        sys.exit(0)

    def _atexit_handler(self) -> None:
        """Handle normal application exit."""
        if not self._lifecycle_in_progress:
            logger.info("[SHUTDOWN] Application exiting normally, running cleanup...")
            self._execute_cleanup()

    def _execute_cleanup(self) -> None:
        """Execute all registered cleanup callbacks at most once, in LIFO order."""
        if self._lifecycle_in_progress:
            return

        self._lifecycle_in_progress = True
        logger.info(f"[SHUTDOWN] Executing {len(self._cleanup_callbacks)} cleanup callbacks...")

        # Reverse order: resources registered last are torn down first.
        for callback, name in reversed(self._cleanup_callbacks):
            try:
                callback_name = name or callback.__name__
                logger.debug(f"[SHUTDOWN] Running cleanup: {callback_name}")
                callback()
                logger.debug(f"[SHUTDOWN] Cleanup completed: {callback_name}")
            except Exception as e:
                # One failing callback must not block the remaining cleanup.
                logger.error(f"[SHUTDOWN] Cleanup failed: {e}", exc_info=True)

        logger.info("[SHUTDOWN] All cleanup callbacks executed")
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def cleanup_chroma_db() -> None:
    """Clean up ChromaDB connections.

    Currently a logged no-op: ChromaDB persists via SQLite, which closes its
    own handles. Kept as an explicit lifecycle step so real teardown can be
    added later without changing the shutdown wiring.
    """
    try:
        logger.info("[CLEANUP] Cleaning up ChromaDB...")
        # ChromaDB uses SQLite which handles cleanup automatically
        logger.info("[CLEANUP] ChromaDB cleanup complete")
    except Exception as e:
        logger.error(f"[CLEANUP] ChromaDB cleanup failed: {e}")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def cleanup_temp_files() -> None:
    """Clean up temporary files created during processing.

    Intentionally conservative: no deletion is performed yet, because nothing
    currently distinguishes this application's temp artifacts from other data
    in the system temp directory. (The original unused tempfile/shutil imports
    and temp-dir lookup were removed as dead code.)
    """
    try:
        # Once temp artifacts carry a recognizable prefix, delete only
        # directories matching that pattern -- never the whole temp dir.
        logger.info("[CLEANUP] Temporary file cleanup complete")
    except Exception as e:
        logger.error(f"[CLEANUP] Temp file cleanup failed: {e}")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def cleanup_logging() -> None:
    """Flush and close all log handlers.

    Flushes every handler attached to the root logger so buffered records
    reach their destinations before the process exits.
    """
    try:
        logger.info("[CLEANUP] Flushing log handlers...")

        # Get root logger and flush all handlers
        root_logger = logging.getLogger()
        for handler in root_logger.handlers:
            handler.flush()

        logger.info("[CLEANUP] Log handlers flushed")
    except Exception as e:
        # Can't log this since logging might be broken
        print(f"[CLEANUP] Log handler cleanup failed: {e}", file=sys.stderr)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def initialize_lifecycle_handler() -> ShutdownHandler:
    """
    Initialize the lifecycle handler with default cleanup callbacks.

    Returns:
        The initialized ShutdownHandler instance
    """
    handler = ShutdownHandler()

    # Registration order matters: callbacks run in reverse, so ChromaDB is
    # cleaned up first and log handlers are flushed last.
    default_callbacks = (
        (cleanup_logging, "Logging cleanup"),
        (cleanup_temp_files, "Temp files cleanup"),
        (cleanup_chroma_db, "ChromaDB cleanup"),
    )
    for callback, label in default_callbacks:
        handler.register_cleanup(callback, label)

    return handler
|
core/logger.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Logging utility module.
|
| 3 |
+
|
| 4 |
+
This module provides a centralized logger instance using the standard library logging.
|
| 5 |
+
The logging configuration is handled by config/logger_setup.py which should be
|
| 6 |
+
called at application startup.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
from core.logging import logger
|
| 10 |
+
logger.info("Your message here")
|
| 11 |
+
"""
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
# Get a logger for the smartdoc module
|
| 15 |
+
# The actual configuration (handlers, formatters) is done in config/logger_setup.py
|
| 16 |
+
logger = logging.getLogger("smartdoc")
|
dependencies.txt
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
aiofiles>=23.2.1
|
| 3 |
+
aiohttp>=3.11.0
|
| 4 |
+
annotated-types>=0.7.0
|
| 5 |
+
anyio>=4.8.0
|
| 6 |
+
|
| 7 |
+
# PDF Processing
|
| 8 |
+
pdfplumber>=0.11.0
|
| 9 |
+
pdf2image>=1.17.0
|
| 10 |
+
Pillow>=10.0.0
|
| 11 |
+
|
| 12 |
+
# Computer Vision for local chart detection (cost optimization)
|
| 13 |
+
opencv-python>=4.8.0
|
| 14 |
+
|
| 15 |
+
# LangChain ecosystem
|
| 16 |
+
langchain>=0.3.16
|
| 17 |
+
langchain-core>=0.3.32
|
| 18 |
+
langchain-text-splitters>=0.3.5
|
| 19 |
+
langchain-google-genai>=2.0.0
|
| 20 |
+
langchain-community>=0.3.16
|
| 21 |
+
|
| 22 |
+
# Google AI for chart analysis
|
| 23 |
+
google-generativeai>=0.8.0
|
| 24 |
+
|
| 25 |
+
# Vector store
|
| 26 |
+
chromadb>=0.6.3
|
| 27 |
+
|
| 28 |
+
# Web framework
|
| 29 |
+
gradio>=5.13.0
|
| 30 |
+
|
| 31 |
+
# Data processing
|
| 32 |
+
pandas>=2.1.4
|
| 33 |
+
numpy>=1.26.4
|
| 34 |
+
beautifulsoup4>=4.12.3
|
| 35 |
+
|
| 36 |
+
# Document loaders
|
| 37 |
+
python-docx>=1.1.2
|
| 38 |
+
docx2txt>=0.8
|
| 39 |
+
|
| 40 |
+
# Configuration
|
| 41 |
+
pydantic>=2.11.10,<2.12.5
|
| 42 |
+
pydantic-settings>=2.10.1,<3.0.0
|
| 43 |
+
python-dotenv>=1.0.1
|
| 44 |
+
|
| 45 |
+
# BM25 retriever
|
| 46 |
+
rank-bm25>=0.2.2
|
| 47 |
+
|
| 48 |
+
# Utilities
|
| 49 |
+
tqdm>=4.67.0
|
| 50 |
+
requests>=2.32.0
|
| 51 |
+
tiktoken>=0.8.0
|
| 52 |
+
tenacity>=9.0.0
|
intelligence/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .knowledge_synthesizer import ResearchAgent
|
| 2 |
+
from .accuracy_verifier import VerificationAgent
|
| 3 |
+
from .orchestrator import AgentWorkflow
|
| 4 |
+
|
| 5 |
+
__all__ = ["ResearchAgent", "VerificationAgent", "AgentWorkflow"]
|
intelligence/accuracy_verifier.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Verification agent module for answer validation against source documents.
|
| 3 |
+
"""
|
| 4 |
+
from typing import Dict, List, Optional, Literal
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
from configuration.parameters import parameters
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class VerificationResult(BaseModel):
    """Structured output model for verification results.

    Produced by the verification LLM via structured output; each field mirrors
    one item of the checklist in VerificationAgent.generate_prompt().
    """

    supported: Literal["YES", "NO", "PARTIAL"] = Field(
        description="Whether the answer is supported by the context"
    )
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the verification result"
    )
    unsupported_claims: List[str] = Field(
        default_factory=list,
        description="Claims not supported by context"
    )
    contradictions: List[str] = Field(
        default_factory=list,
        description="Contradictions between answer and context"
    )
    relevant: Literal["YES", "NO"] = Field(
        description="Whether the answer is relevant to the question"
    )
    completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field(
        default="PARTIAL",
        description="How completely the answer addresses the question"
    )
    additional_details: str = Field(
        default="",
        description="Additional explanations and reasoning"
    )
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class BestAnswerSelection(BaseModel):
    """Structured output model for selecting the best answer from candidates.

    Produced by the selection LLM in VerificationAgent.select_best_answer();
    selected_index is 0-based into the candidate list passed to that method.
    """

    selected_index: int = Field(
        description="The index (0-based) of the best answer from the candidates list"
    )
    reasoning: str = Field(
        description="Explanation of why this answer was selected as the best"
    )
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the selection"
    )
    comparison_summary: str = Field(
        default="",
        description="Brief comparison of the candidate answers"
    )
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class VerificationAgent:
    """Agent for verifying answers against source documents.

    Wraps a Gemini chat model with structured output to (a) verify a single
    answer against retrieved context and (b) pick the best of several
    candidate answers.
    """

    def __init__(
        self,
        llm: Optional[ChatGoogleGenerativeAI] = None,
        max_context_chars: Optional[int] = None,
        max_output_tokens: Optional[int] = None,
    ) -> None:
        """Initialize the verification agent.

        Args:
            llm: Optional pre-built chat model; a model configured from
                ``parameters`` is created when omitted.
            max_context_chars: Cap on context characters sent to the LLM
                (defaults to parameters.VERIFICATION_MAX_CONTEXT_CHARS).
            max_output_tokens: Cap on generated tokens
                (defaults to parameters.VERIFICATION_MAX_OUTPUT_TOKENS).
        """
        logger.info("Initializing VerificationAgent...")

        self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS
        self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS

        base_llm = llm or ChatGoogleGenerativeAI(
            model=parameters.VERIFICATION_AGENT_MODEL,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0,  # deterministic verification
            max_output_tokens=self.max_output_tokens,
        )

        self.llm = base_llm
        # Two structured views over the same model: one per output schema.
        self.structured_llm = base_llm.with_structured_output(VerificationResult)
        self.selection_llm = base_llm.with_structured_output(BestAnswerSelection)

        logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})")

    def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str:
        """Generate verification prompt.

        Args:
            answer: Candidate answer to check.
            context: Concatenated source passages.
            question: Optional original question (enables relevance checks).

        Returns:
            The full prompt string for the verification LLM.
        """
        question_section = f"\n**Original Question:** {question}\n" if question else ""

        return f"""Verify the following answer against the provided context.

**Check for:**
1. Factual support (YES/NO/PARTIAL)
2. Confidence level (HIGH/MEDIUM/LOW)
3. Unsupported claims
4. Contradictions
5. Relevance to question
6. Completeness (COMPLETE/PARTIAL/INCOMPLETE)

**Scoring:**
- HIGH: All claims directly stated, no ambiguity
- MEDIUM: Most claims supported, some inferred
- LOW: Significant claims unsupported
{question_section}
**Answer to Verify:**
{answer}

**Context:**
{context}

Provide your verification analysis."""

    def format_verification_report(self, verification: VerificationResult) -> str:
        """Format verification result into readable report."""
        report = f"**Supported:** {verification.supported}\n"
        report += f"**Confidence:** {verification.confidence}\n"
        report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n"
        report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n"
        report += f"**Relevant:** {verification.relevant}\n"
        report += f"**Completeness:** {verification.completeness}\n"
        report += f"**Additional Details:** {verification.additional_details or 'None'}\n"
        return report

    def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]:
        """Generate feedback for research agent if improvements needed.

        Returns:
            A " | "-joined feedback string, or None when nothing to improve.
        """
        feedback_parts = []
        if verification.supported == "NO":
            feedback_parts.append("Answer lacks sufficient support from documents.")
        elif verification.supported == "PARTIAL":
            feedback_parts.append("Some parts are not well supported.")
        if verification.unsupported_claims:
            # Cap at 3 items to keep feedback concise for the next LLM call.
            claims_str = "; ".join(verification.unsupported_claims[:3])
            feedback_parts.append(f"Unsupported: {claims_str}")
        if verification.contradictions:
            contradictions_str = "; ".join(verification.contradictions[:3])
            feedback_parts.append(f"Contradictions: {contradictions_str}")
        if verification.completeness == "INCOMPLETE":
            feedback_parts.append("Answer is incomplete.")
        if verification.confidence == "LOW":
            feedback_parts.append("Focus on directly verifiable claims.")
        # Always add additional_details if present, even if other feedback exists
        if verification.additional_details:
            feedback_parts.append(f"Additional Details: {verification.additional_details}")
        return " | ".join(feedback_parts) if feedback_parts else None

    def should_retry_research(self, verification: VerificationResult) -> bool:
        """Determine if research should be retried based on verification quality."""
        if verification.supported == "NO" or verification.relevant == "NO":
            return True

        if verification.confidence == "LOW" and (
            verification.unsupported_claims or verification.contradictions
        ):
            return True

        if verification.supported == "PARTIAL" and verification.contradictions:
            return True

        return False

    def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict:
        """
        Verify answer against provided documents.

        Args:
            answer: The answer to verify
            documents: Source documents for verification
            question: Optional original question

        Returns:
            Dict with verification report, context, and metadata
        """
        logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents")

        context = "\n\n".join([doc.page_content for doc in documents])

        # Hard cap to keep the prompt within the model's context budget.
        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        prompt = self.generate_prompt(answer, context, question)

        try:
            logger.debug("Calling LLM for verification...")
            verification_result: VerificationResult = self.structured_llm.invoke(prompt)
            logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})")

        except Exception as e:
            logger.error(f"Structured output failed: {e}")

            # Fallback 1: plain (unstructured) LLM call, parsed heuristically.
            try:
                response = self.llm.invoke(prompt)
                report = response.content if hasattr(response, "content") else str(response)
                verification_result = self._parse_unstructured_response(report.strip())
            except Exception as fallback_error:
                # Fallback 2: conservative failure result so callers can retry.
                logger.error(f"Fallback failed: {fallback_error}")
                verification_result = VerificationResult(
                    supported="NO",
                    confidence="LOW",
                    relevant="NO",
                    completeness="INCOMPLETE",
                    additional_details=f"Verification failed: {str(e)}"
                )

        verification_report = self.format_verification_report(verification_result)
        feedback = self.generate_feedback_for_research(verification_result)

        if feedback:
            logger.debug(f"Generated feedback: {feedback[:80]}...")

        return {
            "verification_report": verification_report,
            "context_used": context,
            "structured_result": verification_result.model_dump(),
            "should_retry": self.should_retry_research(verification_result),
            "feedback": feedback
        }

    def select_best_answer(
        self,
        candidate_answers: List[str],
        documents: List[Document],
        question: str
    ) -> Dict:
        """
        Select the best answer from multiple candidates based on verification criteria.

        Args:
            candidate_answers: List of candidate answers to evaluate
            documents: Source documents for verification
            question: The original question

        Returns:
            Dict with selected answer, index, reasoning, and verification details
        """
        logger.info(f"Selecting best answer from {len(candidate_answers)} candidates")

        if len(candidate_answers) == 0:
            logger.warning("No candidate answers provided")
            return {
                "selected_answer": "No answers were generated.",
                "selected_index": -1,
                "reasoning": "No candidates available",
                "confidence": "LOW"
            }

        if len(candidate_answers) == 1:
            # Nothing to compare -- skip the LLM call entirely.
            logger.info("Only one candidate, returning it directly")
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": "Only one candidate answer was provided",
                "confidence": "MEDIUM"
            }

        context = "\n\n".join([doc.page_content for doc in documents])
        if len(context) > self.max_context_chars:
            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
            context = context[:self.max_context_chars]

        candidates_text = ""
        for i, answer in enumerate(candidate_answers):
            candidates_text += f"\n**Candidate {i + 1}:**\n{answer}\n"

        prompt = f"""You are evaluating multiple candidate answers to select the best one.

**Original Question:** {question}

**Candidate Answers:**
{candidates_text}

**Source Context:**
{context}

**Evaluation Criteria:**
1. **Factual Accuracy**: Which answer is most accurately supported by the context?
2. **Completeness**: Which answer most thoroughly addresses the question?
3. **Relevance**: Which answer stays most focused on what was asked?
4. **No Contradictions**: Which answer has the fewest contradictions with the source?
5. **Clarity**: Which answer is clearest and most well-structured?

Select the best answer by providing its index (0-based) and explain your reasoning."""

        try:
            logger.debug("Calling LLM for best answer selection...")
            selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt)

            # Clamp LLM-provided index into the valid range.
            selected_index = selection_result.selected_index
            if selected_index < 0 or selected_index >= len(candidate_answers):
                logger.warning(f"Invalid selection index {selected_index}, defaulting to 0")
                selected_index = 0

            logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence")

            return {
                "selected_answer": candidate_answers[selected_index],
                "selected_index": selected_index,
                "reasoning": selection_result.reasoning,
                "confidence": selection_result.confidence,
                "comparison_summary": selection_result.comparison_summary
            }

        except Exception as e:
            logger.error(f"Best answer selection failed: {e}")
            # Fallback: return the first candidate
            return {
                "selected_answer": candidate_answers[0],
                "selected_index": 0,
                "reasoning": f"Selection failed, using first candidate: {str(e)}",
                "confidence": "LOW"
            }

    def _parse_unstructured_response(self, response_text: str) -> VerificationResult:
        """Parse unstructured response into VerificationResult (fallback).

        Scans "Key: Value" lines, matching keys case-insensitively.
        BUGFIX: keys are normalized to lowercase, so they must be compared
        against lowercase names -- the previous uppercase comparisons could
        never match, making this fallback always return the defaults.
        """
        try:
            # Conservative defaults: anything unparsed counts as unsupported.
            data = {
                "supported": "NO",
                "confidence": "LOW",
                "unsupported_claims": [],
                "contradictions": [],
                "relevant": "NO",
                "completeness": "INCOMPLETE",
                "additional_details": ""
            }

            for line in response_text.split('\n'):
                if ':' not in line:
                    continue

                key, value = line.split(':', 1)
                key = key.strip().lower().replace(' ', '_')
                value = value.strip().upper()

                if key == "supported":
                    data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO")
                elif key == "confidence":
                    data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW")
                elif key == "relevant":
                    data["relevant"] = "YES" if "YES" in value else "NO"
                elif key == "completeness":
                    # "INCOMPLETE" contains "COMPLETE", so test it explicitly.
                    if "COMPLETE" in value and "INCOMPLETE" not in value:
                        data["completeness"] = "COMPLETE"
                    elif "PARTIAL" in value:
                        data["completeness"] = "PARTIAL"

            return VerificationResult(**data)
        except Exception as e:
            logger.error(f"Failed to parse response: {e}")
            return VerificationResult(
                supported="NO",
                confidence="LOW",
                relevant="NO",
                completeness="INCOMPLETE",
                additional_details="Failed to parse verification response"
            )
|
intelligence/context_validator.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Relevance checker module for document retrieval quality assessment.
|
| 3 |
+
"""
|
| 4 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
from typing import Literal, Optional, List
|
| 7 |
+
import logging
|
| 8 |
+
from configuration.parameters import parameters
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """Approximate the token count of *text* via a chars-per-token heuristic.

    Uses integer division, so short strings round down to zero tokens.
    """
    char_count = len(text)
    return char_count // chars_per_token
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ============================================================================
|
| 19 |
+
# Structured Output Models
|
| 20 |
+
# ============================================================================
|
| 21 |
+
|
| 22 |
+
class ContextValidationClassification(BaseModel):
    """Structured output for context validation classification.

    Produced by the classifier LLM in ContextValidator; classification values
    correspond to ContextValidator.VALID_LABELS.
    """

    classification: Literal["CAN_ANSWER", "PARTIAL", "NO_MATCH"] = Field(
        description=(
            "CAN_ANSWER: Passages contain enough info to fully answer. "
            "PARTIAL: Passages mention the topic but incomplete. "
            "NO_MATCH: Passages don't discuss the topic at all."
        )
    )
    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        default="MEDIUM",
        description="Confidence level in the classification"
    )
    reasoning: str = Field(
        default="",
        description="Brief explanation for the classification"
    )
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class ContextQueryExpansion(BaseModel):
    """Structured output for query expansion/rewriting.

    Produced by the query-expansion LLM in
    ContextValidator.context_query_rewrite() to improve retrieval recall.
    """

    rewritten_query: str = Field(
        description="A rephrased version of the original query"
    )
    key_terms: List[str] = Field(
        default_factory=list,
        description="Key terms and synonyms to search for"
    )
    search_strategy: str = Field(
        default="",
        description="Brief explanation of the search approach"
    )
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class ContextValidator:
    """
    Checks context relevance of retrieved documents to a user's question.

    Uses Gemini model with structured output to classify coverage
    and provides query rewriting for improved retrieval.
    """

    # Deterministic priority order used by the text-parsing fallback: when the
    # raw LLM response mentions several labels, prefer the strongest coverage
    # claim. (Iterating the VALID_LABELS set gave arbitrary, nondeterministic
    # iteration order and therefore a nondeterministic classification.)
    LABEL_PRIORITY = ("CAN_ANSWER", "PARTIAL", "NO_MATCH")
    VALID_LABELS = {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}

    def __init__(self):
        """Initialize the context validator and its three LLM handles."""
        logger.info("Initializing ContextValidator...")

        # One base model instance; structured-output wrappers share it.
        base_llm = ChatGoogleGenerativeAI(
            model=parameters.RELEVANCE_CHECKER_MODEL,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0,          # deterministic classification
            max_output_tokens=100,  # classification answers are short
        )

        self.llm = base_llm  # plain text fallback
        self.structured_llm = base_llm.with_structured_output(ContextValidationClassification)
        self.query_expansion_llm = base_llm.with_structured_output(ContextQueryExpansion)

        logger.info(f"ContextValidator initialized (model={parameters.RELEVANCE_CHECKER_MODEL})")

    def context_query_rewrite(self, original_query: str, context_hint: Optional[str] = None) -> Optional[ContextQueryExpansion]:
        """
        Rewrite a query to potentially retrieve better results.

        Args:
            original_query: The original user query
            context_hint: Optional hint about available documents

        Returns:
            ContextQueryExpansion with rewritten query, or None on failure
        """
        logger.debug(f"Rewriting query: {original_query[:80]}...")

        context_section = f"\n**Available Context:** {context_hint}\n" if context_hint else ""

        prompt = f"""Rewrite this query to improve document retrieval.

**Original Query:** {original_query}
{context_section}
**Instructions:**
1. Rephrase to be more specific and searchable
2. Extract key terms and add synonyms
3. Consider exact phrases in formal documents"""

        try:
            result: ContextQueryExpansion = self.query_expansion_llm.invoke(prompt)
            logger.debug(f"Query rewritten: {result.rewritten_query[:60]}...")
            return result
        except Exception as e:
            # Best-effort helper: callers treat None as "keep the original query".
            logger.error(f"Query rewrite failed: {e}")
            return None

    def context_validate(self, question: str, retriever, k: int = 3) -> str:
        """
        Retrieve top-k passages and classify coverage.

        Args:
            question: The user's question
            retriever: The retriever for fetching documents
            k: Number of top documents to consider

        Returns:
            Classification: "CAN_ANSWER", "PARTIAL", or "NO_MATCH"
        """
        if not question or not question.strip():
            logger.warning("Empty question provided")
            return "NO_MATCH"

        # Silently repair a nonsensical k rather than failing the pipeline.
        if k < 1:
            k = 3

        logger.info(f"Checking context relevance for: {question[:60]}...")

        # Retrieve documents; any retriever failure degrades to NO_MATCH.
        try:
            top_docs = retriever.invoke(question)
        except Exception as e:
            logger.error(f"Retriever invocation failed: {e}")
            return "NO_MATCH"

        if not top_docs:
            logger.info("No documents returned")
            return "NO_MATCH"

        logger.debug(f"Retrieved {len(top_docs)} documents")

        passages = "\n\n".join(doc.page_content for doc in top_docs[:k])

        prompt = f"""Classify how well the passages address the question.

**Question:** {question}

**Passages:**
{passages}

Classify as CAN_ANSWER (fully answers), PARTIAL (mentions topic), or NO_MATCH (unrelated)."""

        try:
            result: ContextValidationClassification = self.structured_llm.invoke(prompt)
            logger.info(f"Context relevance: {result.classification} ({result.confidence})")
            return result.classification

        except Exception as e:
            logger.error(f"Structured output failed: {e}")

            # Fallback to text parsing of the unstructured response.
            try:
                response = self.llm.invoke(prompt)
                raw_response = response.content if hasattr(response, "content") else str(response)
                llm_response = raw_response.strip().upper()

                # BUGFIX: scan labels in a fixed priority order. The original
                # iterated the VALID_LABELS *set*, so a response that mentioned
                # more than one label produced a nondeterministic result.
                for label in self.LABEL_PRIORITY:
                    if label in llm_response:
                        logger.info(f"Fallback classification: {label}")
                        return label

                return "NO_MATCH"

            except Exception as fallback_error:
                logger.error(f"Fallback failed: {fallback_error}")
                return "NO_MATCH"

    def context_validate_with_rewrite(self, question: str, retriever, k: int = 3, max_rewrites: int = 1) -> dict:
        """
        Check relevance with automatic query rewriting if needed.

        Args:
            question: The user's question
            retriever: The retriever to use
            k: Number of top documents
            max_rewrites: Maximum rewrite attempts

        Returns:
            Dict with classification, query_used, and was_rewritten
            (plus key_terms when a rewrite was adopted)
        """
        classification = self.context_validate(question, retriever, k)

        # Already good enough, or rewriting disabled: return as-is.
        if classification == "CAN_ANSWER" or max_rewrites <= 0:
            return {
                "classification": classification,
                "query_used": question,
                "was_rewritten": False
            }

        # Try query rewriting for poor results.
        if classification in ["PARTIAL", "NO_MATCH"]:
            logger.info("Attempting query rewrite...")

            expansion = self.context_query_rewrite(question)
            if expansion and expansion.rewritten_query != question:
                new_classification = self.context_validate(expansion.rewritten_query, retriever, k)

                # Only adopt the rewrite when it strictly improves coverage.
                if self._is_better_classification(new_classification, classification):
                    logger.info(f"Rewrite improved: {classification} -> {new_classification}")
                    return {
                        "classification": new_classification,
                        "query_used": expansion.rewritten_query,
                        "was_rewritten": True,
                        "key_terms": expansion.key_terms
                    }

        # Rewrite failed or did not help: report the original outcome.
        return {
            "classification": classification,
            "query_used": question,
            "was_rewritten": False
        }

    def _is_better_classification(self, new: str, old: str) -> bool:
        """Check if new classification is better than old (unknown labels rank lowest)."""
        ranking = {"NO_MATCH": 0, "PARTIAL": 1, "CAN_ANSWER": 2}
        return ranking.get(new, 0) > ranking.get(old, 0)
|
intelligence/knowledge_synthesizer.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Optional
|
| 2 |
+
import logging
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 5 |
+
from configuration.parameters import parameters
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """
    Estimate token count from text length.

    Uses the common heuristic that one token spans roughly four characters
    of English text.

    Args:
        text: The text to estimate a token count for.
        chars_per_token: Average characters per token; must be >= 1.

    Returns:
        Estimated token count (floor division, so very short strings
        may round down to 0).

    Raises:
        ValueError: If chars_per_token is less than 1.
    """
    # Guard against a zero/negative divisor, which previously raised an
    # opaque ZeroDivisionError or returned a negative count.
    if chars_per_token < 1:
        raise ValueError(f"chars_per_token must be >= 1, got {chars_per_token}")
    return len(text) // chars_per_token
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ResearchAgent:
    """
    ResearchAgent generates answers to user questions using Gemini LLM,
    focusing on extracting factual, source-cited information from documents.
    """
    def __init__(
        self,
        llm: Optional[ChatGoogleGenerativeAI] = None,
        top_k: Optional[int] = None,
        max_context_chars: Optional[int] = None,
        max_output_tokens: Optional[int] = None,
    ) -> None:
        """
        Initialize the research agent with the Gemini model and configuration.

        Any argument left as None falls back to the corresponding value in
        `parameters` (note: `or` means 0 would also trigger the fallback).
        """
        logger.info("[RESEARCH_AGENT] Initializing...")
        self.top_k = top_k or parameters.RESEARCH_TOP_K
        self.max_context_chars = max_context_chars or parameters.RESEARCH_MAX_CONTEXT_CHARS
        self.max_output_tokens = max_output_tokens or parameters.RESEARCH_MAX_OUTPUT_TOKENS
        # An injected llm (e.g. for testing) takes precedence over building one.
        self.llm = llm or ChatGoogleGenerativeAI(
            model=parameters.RESEARCH_AGENT_MODEL,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0.2,
            max_output_tokens=self.max_output_tokens
        )
        logger.info(f"[RESEARCH_AGENT] ✓ Initialized (top_k={self.top_k}, model={parameters.RESEARCH_AGENT_MODEL})")

    def sanitize_response(self, response_text: str) -> str:
        """
        Sanitize the LLM's response by stripping unnecessary whitespace.
        """
        return response_text.strip()

    def generate_prompt(self, question: str, context: str, feedback: Optional[str] = None) -> str:
        """
        Generate a structured prompt for the LLM to generate a precise and factual answer.
        Includes special instructions for handling tables, charts, and visualizations.

        Args:
            question: The user's question, interpolated into the prompt.
            context: Concatenated document text (may contain markdown tables).
            feedback: Optional verifier feedback; when present, an extra
                "fix your previous answer" section is appended.

        Returns:
            The fully assembled prompt string.
        """
        base_prompt = f"""You are an AI assistant designed to provide precise and factual answers based on the given context.

**Instructions:**
- Answer the following question using only the provided context.
- Be clear, concise, and factual.
- Return as much information as you can get from the context.
- Only include claims that are directly supported by the context.

**IMPORTANT - Data Consolidation:**
- If multiple charts, tables, or data sources provide similar information, CONSOLIDATE the data and provide a single, unified answer.
- DO NOT list or compare values from multiple versions of the same charts/tables separately.
- Present only the most relevant or consensus value for each data point, unless there is a clear, significant difference that must be explained.
- If there are minor discrepancies, choose the value that appears most frequently or is best supported by the context, and mention only that value.

**IMPORTANT - Chart and Page Reference:**
- When referencing data from a chart, always indicate the chart's heading or title, and also include the page title if available.
- Do NOT use phrases like "another chart" or "a different chart". Always refer to the chart by its heading/title and the page title if you need to mention the source.

**CRITICAL - Table, Chart, and Visualization Handling:**
- Pay VERY CLOSE attention to any tables in the context (formatted with | characters or markdown table format).
- Tables contain structured data - read them carefully row by row, column by column.
- Extract and cite specific numbers, percentages, scores, and ratings from tables.
- If a numbered table (Table 1, Table 4, etc.) is relevant, explicitly mention it and provide the exact values.
- **Analyze complex charts and visualizations** when present in the context:
  - Look for chart descriptions, data points, trends, and patterns
  - Extract specific values from line charts, bar charts, pie charts, and scatter plots
  - Identify trends, correlations, and relationships shown in visualizations
  - Note any zones, quadrants, or regions in complex diagrams
  - Reference chart titles, axis labels, and legends when citing data
  - Compare multiple visualizations if relevant to the question
"""

        # Re-research path: tell the model what was wrong with its last attempt.
        if feedback:
            base_prompt += f"""
**IMPORTANT - Previous Answer Feedback:**
Your previous answer had issues that need to be addressed:
{feedback}

Please generate an improved answer that:
1. Addresses the unsupported claims by finding support in the context tables and charts
2. Fixes any contradictions with the source material
3. Ensures all statements are verifiable from the context
4. Look carefully at ALL tables and visualizations - the data you need may be in a numbered table or chart description
5. Read table data and chart descriptions carefully - each row/data point represents specific information
"""

        base_prompt += f"""
**Question:** {question}

**Context (pay special attention to tables marked with ### Table, chart descriptions, and data visualizations):**
{context}

**Provide your answer below (cite specific table numbers, chart references, and exact values from the tables and visualizations):**
"""
        return base_prompt

    def generate(
        self,
        question: str,
        documents: List[Document],
        feedback: Optional[str] = None,
        previous_answer: Optional[str] = None
    ) -> Dict:
        """
        Generate an initial answer using the provided documents.
        Args:
            question: The user's question
            documents: List of relevant documents
            feedback: Optional feedback from verification agent for re-research
            previous_answer: Optional previous answer that failed verification
                NOTE(review): previous_answer is accepted but never used in
                this method body — confirm whether it should be woven into
                the prompt or removed from callers.
        Returns:
            Dict with 'draft_answer' and 'context_used'
        Raises:
            RuntimeError: If the underlying LLM call fails.
        """
        logger.info(f"[RESEARCH_AGENT] Generating answer for: {question[:80]}...")
        logger.debug(f"[RESEARCH_AGENT] Documents: {len(documents)}, Feedback: {feedback is not None}")
        if not documents:
            logger.warning("[RESEARCH_AGENT] No documents provided")
            return {
                "draft_answer": "I could not find supporting documents to answer this question.",
                "context_used": ""
            }
        # Combine the top document contents into one string
        context = "\n\n".join([doc.page_content for doc in documents[:self.top_k]])
        # Truncate context if too long (hard character cut; may split a table mid-row)
        if len(context) > self.max_context_chars:
            logger.debug(f"[RESEARCH_AGENT] Context truncated: {len(context)} -> {self.max_context_chars} chars")
            context = context[:self.max_context_chars]
        # Create a prompt for the LLM (with optional feedback)
        prompt = self.generate_prompt(question, context, feedback)
        # Call the LLM to generate the answer
        try:
            response = self.llm.invoke(prompt)
            # LangChain messages expose .content; fall back to str() otherwise.
            content = response.content if hasattr(response, "content") else str(response)
            answer = content.strip()
            logger.info("[RESEARCH_AGENT] Answer generated successfully")
        except Exception as e:
            logger.error(f"[RESEARCH_AGENT] LLM call failed: {e}", exc_info=True)
            raise RuntimeError("Failed to generate answer due to a model error.") from e
        # Sanitize the response; an empty answer gets a fixed refusal message.
        draft_answer = self.sanitize_response(answer) if answer else "I cannot answer this question based on the provided documents."
        logger.debug(f"[RESEARCH_AGENT] Answer length: {len(draft_answer)} chars")
        return {
            "draft_answer": draft_answer,
            "context_used": context
        }
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
|
intelligence/orchestrator.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent orchestrator orchestration using LangGraph.
|
| 3 |
+
|
| 4 |
+
Defines the multi-agent orchestrator that:
|
| 5 |
+
1. Checks document relevance
|
| 6 |
+
2. Generates multiple answer candidates using research agent
|
| 7 |
+
3. Selects the best answer through verification
|
| 8 |
+
4. Provides feedback loop for iterative improvement
|
| 9 |
+
"""
|
| 10 |
+
from langgraph.graph import StateGraph, END
|
| 11 |
+
from typing import TypedDict, List, Dict, Any, Optional
|
| 12 |
+
from langchain_core.documents import Document
|
| 13 |
+
from langchain_core.retrievers import BaseRetriever
|
| 14 |
+
import logging
|
| 15 |
+
|
| 16 |
+
from .knowledge_synthesizer import ResearchAgent
|
| 17 |
+
from .accuracy_verifier import VerificationAgent
|
| 18 |
+
from .context_validator import ContextValidator
|
| 19 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 20 |
+
from configuration.parameters import parameters
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class AgentState(TypedDict):
    """State object passed between orchestrator nodes."""
    question: str  # the user's question (or the current sub-question)
    documents: List[Document]  # retrieved documents the answer is grounded in
    draft_answer: str  # current best answer text
    verification_report: str  # human-readable report from the verification step
    is_relevant: bool  # outcome of the relevance-check gate
    retriever: BaseRetriever  # retriever used for (re-)fetching documents
    feedback: Optional[str]  # verifier feedback used to steer re-research
    research_attempts: int  # number of research iterations performed so far
    query_used: str  # query actually sent to the retriever (may be a rewrite)
    candidate_answers: List[str]  # answer candidates produced by the research step
    selection_reasoning: str  # verifier's rationale for the selected candidate
    # For multi-question support
    is_multi_query: bool  # True when the question decomposes into sub-questions
    sub_queries: List[str]  # decomposed standalone sub-questions
    sub_answers: List[str]  # one answer per sub-question, in order
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class AgentWorkflow:
|
| 45 |
+
"""
|
| 46 |
+
Orchestrates multi-agent orchestrator for document Q&A.
|
| 47 |
+
|
| 48 |
+
Workflow:
|
| 49 |
+
1. Relevance Check - Determines if documents can answer the question
|
| 50 |
+
2. Research - Generates multiple answer candidates using document context
|
| 51 |
+
3. Verification - Selects the best answer from candidates
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
MAX_RESEARCH_ATTEMPTS: int = 7
|
| 55 |
+
NUM_RESEARCH_CANDIDATES: int = 3
|
| 56 |
+
|
| 57 |
+
    def __init__(self, num_candidates: Optional[int] = None) -> None:
        """Initialize orchestrator with required agents.

        Args:
            num_candidates: Optional override for the number of research
                candidates; set as an instance attribute that shadows the
                class-level NUM_RESEARCH_CANDIDATES default.
        """
        logger.info("Initializing AgentWorkflow...")
        self.researcher = ResearchAgent()
        self.verifier = VerificationAgent()
        self.context_validator = ContextValidator()
        # Graph is compiled lazily on the first full_pipeline() call.
        self.compiled_orchestrator = None
        # Separate low-temperature LLM used for query-type detection.
        self.llm = ChatGoogleGenerativeAI(
            model=parameters.LLM_MODEL_NAME,
            google_api_key=parameters.GOOGLE_API_KEY,
            temperature=0.1,
            max_output_tokens=256
        )
        if num_candidates is not None:
            self.NUM_RESEARCH_CANDIDATES = num_candidates
        logger.info(f"AgentWorkflow initialized (candidates={self.NUM_RESEARCH_CANDIDATES})")
|
| 73 |
+
|
| 74 |
+
    def build_orchestrator(self) -> Any:
        """Create and compile the orchestrator graph.

        Topology:
            detect_query_type -> (multi)  process_sub_queries -> combine_answers -> END
                              -> (single) check_relevance -> (relevant)   research -> verify
                                                          -> (irrelevant) END
            verify loops back to research on "re_research", otherwise ends.

        Returns:
            The compiled LangGraph runnable.
        """
        logger.debug("Building orchestrator graph...")
        orchestrator = StateGraph(AgentState)

        # Register all processing nodes.
        orchestrator.add_node("detect_query_type", self._detect_query_type)
        orchestrator.add_node("process_sub_queries", self._process_sub_queries_step)
        orchestrator.add_node("combine_answers", self._combine_answers_step)
        orchestrator.add_node("check_relevance", self._check_relevance_step)
        orchestrator.add_node("research", self._research_step)
        orchestrator.add_node("verify", self._verification_step)

        # Branch first on whether the question decomposes into sub-questions.
        orchestrator.set_entry_point("detect_query_type")
        orchestrator.add_conditional_edges(
            "detect_query_type",
            lambda state: "multi" if state.get("is_multi_query") else "single",
            {"multi": "process_sub_queries", "single": "check_relevance"}
        )
        # Multi-question path terminates after combining sub-answers.
        orchestrator.add_edge("process_sub_queries", "combine_answers")
        orchestrator.add_edge("combine_answers", END)
        # Single-question path: gate on relevance before researching.
        orchestrator.add_conditional_edges(
            "check_relevance",
            self._decide_after_relevance_check,
            {"relevant": "research", "irrelevant": END}
        )
        orchestrator.add_edge("research", "verify")
        # _decide_next_step is defined later in this class (outside this view).
        orchestrator.add_conditional_edges(
            "verify",
            self._decide_next_step,
            {"re_research": "research", "end": END}
        )

        return orchestrator.compile()
|
| 107 |
+
|
| 108 |
+
def _detect_query_type(self, state: AgentState) -> Dict[str, Any]:
|
| 109 |
+
"""
|
| 110 |
+
Use LLM to detect if the question is multi-part and decompose it if so.
|
| 111 |
+
"""
|
| 112 |
+
prompt = f"""
|
| 113 |
+
You are an expert assistant for document Q&A. Analyze the following question and determine:
|
| 114 |
+
1. Is it a single question or does it contain multiple sub-questions?
|
| 115 |
+
2. If it contains multiple questions, decompose it into a list of clear, standalone sub-questions (no overlap, no ambiguity).
|
| 116 |
+
|
| 117 |
+
Return your answer as a JSON object with two fields:
|
| 118 |
+
- is_multi_query: true or false
|
| 119 |
+
- sub_queries: a list of strings (the sub-questions, or a single-item list if only one)
|
| 120 |
+
|
| 121 |
+
Question: {state['question']}
|
| 122 |
+
"""
|
| 123 |
+
try:
|
| 124 |
+
response = self.llm.invoke(prompt)
|
| 125 |
+
import json
|
| 126 |
+
content = response.content if hasattr(response, "content") else str(response)
|
| 127 |
+
# Try to extract JSON from the response
|
| 128 |
+
start = content.find('{')
|
| 129 |
+
end = content.rfind('}')
|
| 130 |
+
if start != -1 and end != -1:
|
| 131 |
+
json_str = content[start:end+1]
|
| 132 |
+
result = json.loads(json_str)
|
| 133 |
+
is_multi = bool(result.get("is_multi_query", False))
|
| 134 |
+
sub_queries = result.get("sub_queries", [])
|
| 135 |
+
else:
|
| 136 |
+
# Fallback: treat as single question
|
| 137 |
+
is_multi = False
|
| 138 |
+
sub_queries = [state["question"]]
|
| 139 |
+
except Exception as e:
|
| 140 |
+
logger.error(f"LLM decomposition failed: {e}")
|
| 141 |
+
is_multi = False
|
| 142 |
+
sub_queries = [state["question"]]
|
| 143 |
+
if is_multi:
|
| 144 |
+
logger.info(f"[LLM Decompose] Multi-question detected: {len(sub_queries)} sub-queries")
|
| 145 |
+
else:
|
| 146 |
+
logger.info("[LLM Decompose] Single question detected; no decomposition needed.")
|
| 147 |
+
return {"is_multi_query": is_multi, "sub_queries": sub_queries}
|
| 148 |
+
|
| 149 |
+
    def _process_sub_queries_step(self, state: AgentState) -> Dict[str, Any]:
        """Run the relevance/research/verify pipeline once per sub-question.

        Each sub-question is processed on a shallow copy of the state so the
        per-question intermediate fields do not leak between iterations.

        Returns:
            Dict with 'sub_answers': one answer string per sub-query, in order.
        """
        sub_answers = []
        logger.info(f"[Decompose] Processing {len(state['sub_queries'])} sub-queries...")
        for sub_query in state["sub_queries"]:
            logger.info(f"[Decompose] Processing sub-query: {sub_query}")
            # Shallow copy: contained objects (documents, retriever) are shared.
            sub_state = state.copy()
            sub_state["question"] = sub_query
            rel = self._check_relevance_step(sub_state)
            if not rel.get("is_relevant"):
                # Irrelevant sub-question: record the canned refusal and move on.
                logger.warning(f"[Decompose] Sub-query not relevant: {sub_query}")
                sub_answers.append(rel.get("draft_answer", "No answer found."))
                continue
            # Fold each step's partial state update back in before the next step.
            sub_state.update(rel)
            research = self._research_step(sub_state)
            sub_state.update(research)
            verify = self._verification_step(sub_state)
            sub_state.update(verify)
            sub_answers.append(sub_state["draft_answer"])
        logger.info(f"[Decompose] Sub-query answers: {sub_answers}")
        return {"sub_answers": sub_answers}
|
| 169 |
+
|
| 170 |
+
def _combine_answers_step(self, state: AgentState) -> Dict[str, Any]:
|
| 171 |
+
logger.info(f"[Decompose] Combining {len(state['sub_answers'])} sub-answers into final answer.")
|
| 172 |
+
combined = "\n\n".join(f"Q{i+1}: {q}\nA: {a}" for i, (q, a) in enumerate(zip(state["sub_queries"], state["sub_answers"])))
|
| 173 |
+
return {"draft_answer": combined, "verification_report": "Multi-question answer combined."}
|
| 174 |
+
|
| 175 |
+
    def _check_relevance_step(self, state: AgentState) -> Dict[str, Any]:
        """Check if retrieved documents are relevant to the question.

        Delegates to the ContextValidator, allowing one automatic query
        rewrite. When a rewrite was adopted, the documents are re-fetched
        with the rewritten query so downstream research uses them.

        Returns:
            Partial state update: 'is_relevant', 'query_used', optionally
            'documents' (on rewrite) or 'draft_answer' (on irrelevance).
        """
        logger.debug("Checking context relevance...")

        # NOTE(review): k=20 is hard-coded here and differs from the
        # validator's default of 3 — confirm this is intentional.
        result = self.context_validator.context_validate_with_rewrite(
            question=state["question"],
            retriever=state["retriever"],
            k=20,
            max_rewrites=1
        )

        classification = result["classification"]
        query_used = result["query_used"]
        was_rewritten = result.get("was_rewritten", False)

        logger.info(f"Relevance: {classification}")
        if was_rewritten:
            logger.debug(f"Query rewritten: {query_used[:60]}...")

        # PARTIAL coverage is accepted as "good enough" to attempt research.
        if classification in ["CAN_ANSWER", "PARTIAL"]:
            if was_rewritten:
                # Refresh the document set with the better query.
                documents = state["retriever"].invoke(query_used)
                return {"is_relevant": True, "query_used": query_used, "documents": documents}
            return {"is_relevant": True, "query_used": state["question"]}
        else:
            # NO_MATCH: short-circuit with a canned refusal answer.
            return {
                "is_relevant": False,
                "query_used": state["question"],
                "draft_answer": "This question isn't related to the uploaded documents. Please ask another question."
            }
|
| 205 |
+
|
| 206 |
+
def _decide_after_relevance_check(self, state: AgentState) -> str:
|
| 207 |
+
"""Decide next step after relevance check."""
|
| 208 |
+
return "relevant" if state["is_relevant"] else "irrelevant"
|
| 209 |
+
|
| 210 |
+
    def full_pipeline(self, question: str, retriever: BaseRetriever) -> Dict[str, str]:
        """
        Execute the full Q&A pipeline.

        Args:
            question: The user's question
            retriever: The retriever for document lookup

        Returns:
            Dict with 'draft_answer' and 'verification_report'

        Raises:
            RuntimeError: If any stage of the workflow fails.
        """
        try:
            # Compile the graph lazily on first use and cache it.
            if self.compiled_orchestrator is None:
                self.compiled_orchestrator = self.build_orchestrator()

            logger.info(f"Starting pipeline: {question[:80]}...")

            # Initial retrieval with the raw question; the relevance step may
            # re-fetch with a rewritten query later.
            documents = retriever.invoke(question)
            logger.info(f"Retrieved {len(documents)} documents")

            # Every AgentState key must be present before invoking the graph.
            initial_state: AgentState = {
                "question": question,
                "documents": documents,
                "draft_answer": "",
                "verification_report": "",
                "is_relevant": False,
                "retriever": retriever,
                "feedback": None,
                "research_attempts": 0,
                "query_used": question,
                "candidate_answers": [],
                "selection_reasoning": "",
                "is_multi_query": False,
                "sub_queries": [],
                "sub_answers": []
            }

            final_state = self.compiled_orchestrator.invoke(initial_state)

            logger.info(f"Pipeline completed (attempts: {final_state.get('research_attempts', 1)})")

            return {
                "draft_answer": final_state["draft_answer"],
                "verification_report": final_state["verification_report"]
            }

        except Exception as e:
            # Wrap every failure in a single exception type for callers.
            logger.error(f"Pipeline failed: {e}", exc_info=True)
            raise RuntimeError(f"Workflow execution failed: {e}") from e
|
| 259 |
+
|
| 260 |
+
    def _research_step(self, state: AgentState) -> Dict[str, Any]:
        """Generate multiple answer candidates using the research agent.

        Builds a consolidated feedback string from the verifier's output and
        generates NUM_RESEARCH_CANDIDATES independent drafts for the
        verification step to choose among.

        Returns:
            Partial state update: 'candidate_answers', incremented
            'research_attempts', and 'feedback' cleared to None.
        """
        attempts = state.get("research_attempts", 0) + 1
        feedback = state.get("feedback")
        # Only pass the previous draft along when we are re-researching.
        previous_answer = state.get("draft_answer") if feedback else None
        # Consolidate contradictions and unsupported claims into feedback.
        # NOTE(review): these three keys are not declared in AgentState —
        # presumably set by the verification step; confirm they are populated.
        contradictions = state.get("contradictions_for_research", [])
        unsupported_claims = state.get("unsupported_claims_for_research", [])
        feedback_for_research = state.get("feedback_for_research", feedback)
        extra_feedback = ""
        if contradictions:
            extra_feedback += " Contradictions: " + "; ".join(contradictions) + "."
        if unsupported_claims:
            extra_feedback += " Unsupported Claims: " + "; ".join(unsupported_claims) + "."
        # If feedback_for_research is present, append extra_feedback; otherwise, use extra_feedback only
        if feedback_for_research:
            feedback_for_research = feedback_for_research + extra_feedback
        else:
            feedback_for_research = extra_feedback.strip()
        logger.info(f"Research step (attempt {attempts}/{self.MAX_RESEARCH_ATTEMPTS})")
        logger.info(f"Generating {self.NUM_RESEARCH_CANDIDATES} candidate answers...")
        candidate_answers = []
        for i in range(self.NUM_RESEARCH_CANDIDATES):
            logger.info(f"Generating candidate {i + 1}/{self.NUM_RESEARCH_CANDIDATES}")
            result = self.researcher.generate(
                question=state["question"],
                documents=state["documents"],
                feedback=feedback_for_research,
                previous_answer=previous_answer
            )
            candidate_answers.append(result["draft_answer"])
        logger.info(f"Generated {len(candidate_answers)} candidate answers")
        # Clear feedback so a subsequent pass starts clean unless re-set by verify.
        return {
            "candidate_answers": candidate_answers,
            "research_attempts": attempts,
            "feedback": None
        }
|
| 297 |
+
|
| 298 |
+
def _verification_step(self, state: AgentState) -> Dict[str, Any]:
|
| 299 |
+
"""Select the best answer from candidates and verify it."""
|
| 300 |
+
logger.debug("Selecting best answer from candidates...")
|
| 301 |
+
|
| 302 |
+
candidate_answers = state.get("candidate_answers", [])
|
| 303 |
+
|
| 304 |
+
if not candidate_answers:
|
| 305 |
+
logger.warning("No candidate answers found, using draft_answer")
|
| 306 |
+
candidate_answers = [state.get("draft_answer", "")]
|
| 307 |
+
|
| 308 |
+
# Select the best answer from candidates
|
| 309 |
+
selection_result = self.verifier.select_best_answer(
|
| 310 |
+
candidate_answers=candidate_answers,
|
| 311 |
+
documents=state["documents"],
|
| 312 |
+
question=state["question"]
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
best_answer = selection_result["selected_answer"]
|
| 316 |
+
selection_reasoning = selection_result.get("reasoning", "")
|
| 317 |
+
|
| 318 |
+
logger.info(f"Selected candidate {selection_result['selected_index'] + 1} as best answer")
|
| 319 |
+
|
| 320 |
+
# Verify the selected answer
|
| 321 |
+
verification_result = self.verifier.check(
|
| 322 |
+
answer=best_answer,
|
| 323 |
+
documents=state["documents"],
|
| 324 |
+
question=state["question"]
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
# Enhance verification report with selection info
|
| 328 |
+
verification_report = verification_result["verification_report"]
|
| 329 |
+
verification_report = f"**Candidates Evaluated:** {len(candidate_answers)}\n" + \
|
| 330 |
+
f"**Selected Candidate:** {selection_result['selected_index'] + 1}\n" + \
|
| 331 |
+
f"**Selection Confidence:** {selection_result.get('confidence', 'N/A')}\n" + \
|
| 332 |
+
f"**Selection Reasoning:** {selection_reasoning}\n\n" + \
|
| 333 |
+
verification_report
|
| 334 |
+
|
| 335 |
+
return {
|
| 336 |
+
"draft_answer": best_answer,
|
| 337 |
+
"verification_report": verification_report,
|
| 338 |
+
"feedback": verification_result.get("feedback"),
|
| 339 |
+
"selection_reasoning": selection_reasoning
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
def _decide_next_step(self, state: AgentState) -> str:
|
| 343 |
+
"""Decide whether to re-research or end orchestrator."""
|
| 344 |
+
verification_report = state["verification_report"]
|
| 345 |
+
research_attempts = state.get("research_attempts", 1)
|
| 346 |
+
feedback = state.get("feedback")
|
| 347 |
+
needs_re_research = False
|
| 348 |
+
# Extract contradictions and unsupported claims for feedback
|
| 349 |
+
contradictions = []
|
| 350 |
+
unsupported_claims = []
|
| 351 |
+
import re
|
| 352 |
+
for line in verification_report.splitlines():
|
| 353 |
+
if line.startswith("**Contradictions:"):
|
| 354 |
+
contradictions = [c.strip() for c in line.split(":", 1)[-1].split(",") if c.strip() and c.strip().lower() != "none"]
|
| 355 |
+
if line.startswith("**Unsupported Claims:"):
|
| 356 |
+
unsupported_claims = [u.strip() for u in line.split(":", 1)[-1].split(",") if u.strip() and u.strip().lower() != "none"]
|
| 357 |
+
if "Supported: NO" in verification_report:
|
| 358 |
+
needs_re_research = True
|
| 359 |
+
logger.warning("[Re-Research] Answer not supported; triggering re-research.")
|
| 360 |
+
elif "Relevant: NO" in verification_report:
|
| 361 |
+
needs_re_research = True
|
| 362 |
+
logger.warning("[Re-Research] Answer not relevant; triggering re-research.")
|
| 363 |
+
elif "Confidence: LOW" in verification_report and "Supported: PARTIAL" in verification_report:
|
| 364 |
+
needs_re_research = True
|
| 365 |
+
logger.warning("[Re-Research] Low confidence with partial support; triggering re-research.")
|
| 366 |
+
elif "Completeness: INCOMPLETE" in verification_report:
|
| 367 |
+
needs_re_research = True
|
| 368 |
+
logger.warning("[Re-Research] Answer is incomplete; triggering re-research.")
|
| 369 |
+
elif "Completeness: PARTIAL" in verification_report:
|
| 370 |
+
needs_re_research = True
|
| 371 |
+
logger.warning("[Re-Research] Answer is partially complete; triggering re-research.")
|
| 372 |
+
if feedback and not needs_re_research:
|
| 373 |
+
if "contradiction" in feedback.lower() or "unsupported" in feedback.lower():
|
| 374 |
+
needs_re_research = True
|
| 375 |
+
logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.")
|
| 376 |
+
# Store extra feedback for research node
|
| 377 |
+
state["contradictions_for_research"] = contradictions
|
| 378 |
+
state["unsupported_claims_for_research"] = unsupported_claims
|
| 379 |
+
state["feedback_for_research"] = feedback
|
| 380 |
+
if needs_re_research and research_attempts < self.MAX_RESEARCH_ATTEMPTS:
|
| 381 |
+
logger.info(f"[Re-Research] Re-researching (attempt {research_attempts + 1})")
|
| 382 |
+
return "re_research"
|
| 383 |
+
elif needs_re_research:
|
| 384 |
+
logger.warning("[Re-Research] Max attempts reached, returning best effort.")
|
| 385 |
+
return "end"
|
| 386 |
+
else:
|
| 387 |
+
logger.info("[Re-Research] Verification passed; ending workflow.")
|
| 388 |
+
return "end"
|
main.py
ADDED
|
@@ -0,0 +1,986 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import configuration.logger_setup
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
logger = logging.getLogger(__name__)
|
| 5 |
+
|
| 6 |
+
import hashlib
|
| 7 |
+
import socket
|
| 8 |
+
from typing import List, Dict
|
| 9 |
+
import os
|
| 10 |
+
import shutil
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
import time
|
| 14 |
+
import random
|
| 15 |
+
|
| 16 |
+
from content_analyzer.document_parser import DocumentProcessor
|
| 17 |
+
from search_engine.indexer import RetrieverBuilder
|
| 18 |
+
from intelligence.orchestrator import AgentWorkflow
|
| 19 |
+
from configuration import definitions, parameters
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
# Example data for demo
|
| 23 |
+
# Preset demo examples: maps a display name (shown in the example dropdown)
# to a canned question and the bundled sample PDF(s) it should be answered
# from. Paths are relative to the application working directory.
EXAMPLES = {
    "Generative AI and Jobs": {
        "question": "Which occupations are most likely to be automated by AI?",
        "file_paths": ["samples/OIT-NASK-IAGen_WP140_web.pdf"]
    },
    "Energy and AI": {
        "question": "What is the accuracy of AI models in coding?",
        "file_paths": ["samples/EnergyandAI.pdf"]
    },
    "Digital Progress and Trends Report 2025": {
        "question": "which country has most Gen Ai patents and which country has most total funding raised by AI start-ups?",
        "file_paths": ["samples/Digital Progress and Trends Report 2025, Strengthening AI Foundations.pdf"]
    }
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def format_chat_history(history: List[Dict]) -> str:
    """Render the conversation history as a markdown transcript.

    Args:
        history: List of Q&A records with "timestamp", "question",
            "answer" and "confidence" keys (all optional).

    Returns:
        A markdown string, or a placeholder message when history is empty.
    """
    if not history:
        return "*No conversation history yet. Ask a question to get started!*"

    sections = []
    for idx, record in enumerate(history, 1):
        sections.append(
            "\n---\n"
            f"### 💬 Q{idx} ({record.get('timestamp', '')})\n"
            f"**Question:** {record.get('question', '')}\n"
            "\n"
            f"**Answer:** {record.get('answer', '')}\n"
            "\n"
            f"*Confidence: {record.get('confidence', 'N/A')}*\n"
        )
    return "\n".join(sections)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def format_document_context(documents: List, question: str = "") -> str:
    """Format retrieved documents as collapsible markdown with term highlighting.

    Shows up to five chunks; content is truncated at 500 characters and key
    terms from the question are bolded (case-insensitively, preserving the
    document's original casing).

    Args:
        documents: Retrieved chunks; LangChain-style objects with
            ``page_content``/``metadata`` are supported, anything else is
            rendered via ``str()``.
        question: The user question, used to derive highlight terms.

    Returns:
        A markdown string, or a placeholder when no documents are given.
    """
    import re  # local import: module top-level does not import re

    if not documents:
        return "*No documents retrieved yet.*"

    formatted = [f"### 📚 Retrieved Context ({len(documents)} chunks)\n"]

    # Extract key terms from the question (minus stopwords) for highlighting.
    key_terms = []
    if question:
        stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'what', 'how', 'why', 'when', 'where', 'which'}
        key_terms = [word.lower() for word in question.split() if word.lower() not in stopwords and len(word) > 2]

    for i, doc in enumerate(documents[:5], 1):
        content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
        source = doc.metadata.get('source', 'Unknown') if hasattr(doc, 'metadata') else 'Unknown'

        # Truncate long content so the panel stays readable.
        if len(content) > 500:
            content = content[:500] + "..."

        # Bold each key term; the callable replacement keeps the matched
        # text's original casing (the old code substituted the lowercased
        # term, mangling e.g. "Energy" into "energy").
        highlighted_content = content
        for term in key_terms[:5]:
            pattern = re.compile(re.escape(term), re.IGNORECASE)
            highlighted_content = pattern.sub(
                lambda m: f"**{m.group(0)}**", highlighted_content
            )

        formatted.append(f"""
<details>
<summary>📄 Chunk {i} - {os.path.basename(source)}</summary>

{highlighted_content}

</details>
""")

    if len(documents) > 5:
        formatted.append(f"\n*... and {len(documents) - 5} more chunks*")

    return "\n".join(formatted)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _get_file_hashes(uploaded_files: List) -> frozenset:
|
| 108 |
+
"""Generate SHA-256 hashes for uploaded files."""
|
| 109 |
+
hashes = set()
|
| 110 |
+
for file in uploaded_files:
|
| 111 |
+
with open(file.name, "rb") as f:
|
| 112 |
+
hashes.add(hashlib.sha256(f.read()).hexdigest())
|
| 113 |
+
return frozenset(hashes)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _find_open_port(start_port: int, max_attempts: int = 20) -> int:
|
| 117 |
+
"""Find an available TCP port starting from start_port."""
|
| 118 |
+
port = start_port
|
| 119 |
+
for _ in range(max_attempts):
|
| 120 |
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
| 121 |
+
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
| 122 |
+
try:
|
| 123 |
+
sock.bind(("127.0.0.1", port))
|
| 124 |
+
return port
|
| 125 |
+
except OSError:
|
| 126 |
+
port += 1
|
| 127 |
+
raise RuntimeError(f"Could not find an open port starting at {start_port}")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _setup_gradio_shim():
    """Shim Gradio's JSON schema conversion to tolerate boolean additionalProperties values."""
    import gradio as gr
    from gradio_client import utils as grc_utils

    original_converter = grc_utils._json_schema_to_python_type

    def _tolerant_json_schema_to_python_type(schema, defs=None):
        # JSON Schema permits a bare boolean schema: True accepts any value,
        # False accepts none. The stock converter assumes a dict and crashes.
        if isinstance(schema, bool):
            return "Any" if schema else "Never"
        return original_converter(schema, defs)

    grc_utils._json_schema_to_python_type = _tolerant_json_schema_to_python_type
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def main():
|
| 143 |
+
"""Main application entry point."""
|
| 144 |
+
_setup_gradio_shim()
|
| 145 |
+
|
| 146 |
+
logger.info("=" * 60)
|
| 147 |
+
logger.info("Starting SmartDoc AI application...")
|
| 148 |
+
logger.info("=" * 60)
|
| 149 |
+
|
| 150 |
+
# Initialize components
|
| 151 |
+
processor = DocumentProcessor()
|
| 152 |
+
retriever_indexer = RetrieverBuilder()
|
| 153 |
+
orchestrator = AgentWorkflow()
|
| 154 |
+
|
| 155 |
+
logger.info("All components initialized successfully")
|
| 156 |
+
|
| 157 |
+
# CSS styling - Clean, accessible light theme with professional colors
|
| 158 |
+
css = """
|
| 159 |
+
/* Global styling - Light, clean background */
|
| 160 |
+
.gradio-container {
|
| 161 |
+
background: linear-gradient(180deg, #f8fafc 0%, #e2e8f0 100%) !important;
|
| 162 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
/* Title styles - Dark text for readability */
|
| 166 |
+
.app-title {
|
| 167 |
+
font-size: 2.2em !important;
|
| 168 |
+
text-align: center !important;
|
| 169 |
+
color: #1e293b !important;
|
| 170 |
+
font-weight: 700 !important;
|
| 171 |
+
margin-bottom: 8px !important;
|
| 172 |
+
}
|
| 173 |
+
.app-subtitle {
|
| 174 |
+
font-size: 1.1em !important;
|
| 175 |
+
text-align: center !important;
|
| 176 |
+
color: #0369a1 !important;
|
| 177 |
+
font-weight: 500 !important;
|
| 178 |
+
}
|
| 179 |
+
.app-description {
|
| 180 |
+
text-align: center;
|
| 181 |
+
color: #475569 !important;
|
| 182 |
+
font-size: 0.95em !important;
|
| 183 |
+
line-height: 1.6 !important;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
/* Section headers */
|
| 187 |
+
.section-header {
|
| 188 |
+
color: #1e293b !important;
|
| 189 |
+
font-weight: 600 !important;
|
| 190 |
+
border-bottom: 2px solid #0ea5e9 !important;
|
| 191 |
+
padding-bottom: 8px !important;
|
| 192 |
+
margin-bottom: 16px !important;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
/* Chat history panel - Clean white card with more height */
|
| 196 |
+
.chat-history {
|
| 197 |
+
min-height: 500px;
|
| 198 |
+
max-height: 600px;
|
| 199 |
+
overflow-y: auto;
|
| 200 |
+
border: 1px solid #cbd5e1;
|
| 201 |
+
border-radius: 12px;
|
| 202 |
+
padding: 20px;
|
| 203 |
+
background: #ffffff;
|
| 204 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
|
| 205 |
+
color: #334155 !important;
|
| 206 |
+
}
|
| 207 |
+
#chat-history {
|
| 208 |
+
min-height: 120px !important;
|
| 209 |
+
max-height: none !important;
|
| 210 |
+
height: auto !important;
|
| 211 |
+
}
|
| 212 |
+
.chat-history h3 {
|
| 213 |
+
color: #0f172a !important;
|
| 214 |
+
}
|
| 215 |
+
.chat-history strong {
|
| 216 |
+
color: #1e293b !important;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
/* Document context panel */
|
| 220 |
+
.doc-context {
|
| 221 |
+
max-height: 380px;
|
| 222 |
+
overflow-y: auto;
|
| 223 |
+
border: 1px solid #cbd5e1;
|
| 224 |
+
border-radius: 12px;
|
| 225 |
+
padding: 20px;
|
| 226 |
+
background: #ffffff;
|
| 227 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
|
| 228 |
+
color: #334155 !important;
|
| 229 |
+
}
|
| 230 |
+
.doc-context details {
|
| 231 |
+
margin-bottom: 12px;
|
| 232 |
+
padding: 14px;
|
| 233 |
+
background: #f1f5f9;
|
| 234 |
+
border-radius: 8px;
|
| 235 |
+
border-left: 4px solid #0ea5e9;
|
| 236 |
+
}
|
| 237 |
+
.doc-context summary {
|
| 238 |
+
cursor: pointer;
|
| 239 |
+
font-weight: 600;
|
| 240 |
+
color: #0369a1 !important;
|
| 241 |
+
}
|
| 242 |
+
.doc-context p, .doc-context span {
|
| 243 |
+
color: #475569 !important;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
/* Answer box - Success green accent, auto-height */
|
| 247 |
+
.answer-box > div:nth-child(2) {
|
| 248 |
+
border-left: 4px solid #10b981 !important;
|
| 249 |
+
padding: 16px 16px 16px 20px !important;
|
| 250 |
+
background: #f0fdf4 !important;
|
| 251 |
+
border-radius: 8px !important;
|
| 252 |
+
min-height: 100px;
|
| 253 |
+
color: #166534 !important;
|
| 254 |
+
}
|
| 255 |
+
.answer-box p, .answer-box li, .answer-box span {
|
| 256 |
+
color: #166534 !important;
|
| 257 |
+
}
|
| 258 |
+
.answer-box strong {
|
| 259 |
+
color: #14532d !important;
|
| 260 |
+
}
|
| 261 |
+
.answer-box h1, .answer-box h2, .answer-box h3, .answer-box h4 {
|
| 262 |
+
color: #15803d !important;
|
| 263 |
+
}
|
| 264 |
+
.answer-box code {
|
| 265 |
+
background: #dcfce7 !important;
|
| 266 |
+
color: #166534 !important;
|
| 267 |
+
padding: 2px 6px !important;
|
| 268 |
+
border-radius: 4px !important;
|
| 269 |
+
}
|
| 270 |
+
.answer-box pre {
|
| 271 |
+
background: #dcfce7 !important;
|
| 272 |
+
padding: 12px !important;
|
| 273 |
+
border-radius: 6px !important;
|
| 274 |
+
overflow-x: auto !important;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
/* Verification box - Blue accent */
|
| 278 |
+
.verification-box > div:nth-child(2) {
|
| 279 |
+
border-left: 4px solid #0ea5e9 !important;
|
| 280 |
+
padding: 16px 16px 16px 20px !important;
|
| 281 |
+
background: #f0f9ff !important;
|
| 282 |
+
border-radius: 8px !important;
|
| 283 |
+
min-height: 80px;
|
| 284 |
+
color: #0369a1 !important;
|
| 285 |
+
}
|
| 286 |
+
.verification-box p, .verification-box li, .verification-box span {
|
| 287 |
+
color: #0c4a6e !important;
|
| 288 |
+
}
|
| 289 |
+
.verification-box strong {
|
| 290 |
+
color: #075985 !important;
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
/* Stats panel - Professional blue gradient */
|
| 294 |
+
.stats-panel {
|
| 295 |
+
background: linear-gradient(135deg, #0369a1 0%, #0284c7 50%, #0ea5e9 100%) !important;
|
| 296 |
+
color: #ffffff !important;
|
| 297 |
+
padding: 20px !important;
|
| 298 |
+
border-radius: 12px !important;
|
| 299 |
+
text-align: center;
|
| 300 |
+
box-shadow: 0 4px 14px rgba(3, 105, 161, 0.3);
|
| 301 |
+
}
|
| 302 |
+
.stats-panel strong {
|
| 303 |
+
color: #ffffff !important;
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
/* Info panel */
|
| 307 |
+
.info-panel {
|
| 308 |
+
background: #eff6ff !important;
|
| 309 |
+
border: 1px solid #bfdbfe !important;
|
| 310 |
+
border-radius: 8px !important;
|
| 311 |
+
padding: 12px !important;
|
| 312 |
+
color: #1e40af !important;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
/* Form elements */
|
| 316 |
+
.gr-input, .gr-textbox textarea {
|
| 317 |
+
background: #ffffff !important;
|
| 318 |
+
border: 1px solid #cbd5e1 !important;
|
| 319 |
+
border-radius: 8px !important;
|
| 320 |
+
color: #1e293b !important;
|
| 321 |
+
}
|
| 322 |
+
.gr-input:focus, .gr-textbox textarea:focus {
|
| 323 |
+
border-color: #0ea5e9 !important;
|
| 324 |
+
box-shadow: 0 0 0 3px rgba(14, 165, 233, 0.1) !important;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
/* Labels */
|
| 328 |
+
label {
|
| 329 |
+
color: #374151 !important;
|
| 330 |
+
font-weight: 500 !important;
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
/* Dropdown - High contrast with darker background for visibility */
|
| 334 |
+
.gr-dropdown,
|
| 335 |
+
[data-testid="dropdown"],
|
| 336 |
+
.svelte-dropdown,dropdownExample
|
| 337 |
+
div[class*="dropdown"] {
|
| 338 |
+
background: #e0e7ff !important;
|
| 339 |
+
color: #1e293b !important;
|
| 340 |
+
border: 2px solid #1e40af !important;
|
| 341 |
+
border-radius: 8px !important;
|
| 342 |
+
box-shadow: 0 2px 8px rgba(30, 64, 175, 0.2) !important;
|
| 343 |
+
}
|
| 344 |
+
.gr-dropdown:hover,
|
| 345 |
+
[data-testid="dropdown"]:hover {
|
| 346 |
+
background: #c7d2fe !important;
|
| 347 |
+
border-color: #1d4ed8 !important;
|
| 348 |
+
box-shadow: 0 4px 12px rgba(30, 64, 175, 0.3) !important;
|
| 349 |
+
}
|
| 350 |
+
.gr-dropdown select,
|
| 351 |
+
.gr-dropdown input,
|
| 352 |
+
[data-testid="dropdown"] input {
|
| 353 |
+
color: #1e293b !important;
|
| 354 |
+
background: transparent !important;
|
| 355 |
+
font-weight: 500 !important;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
/* Dropdown container and options */
|
| 359 |
+
[data-testid="dropdown"] span,
|
| 360 |
+
.dropdown-container span,
|
| 361 |
+
div[class*="dropdown"] span {
|
| 362 |
+
color: #1e293b !important;
|
| 363 |
+
font-weight: 500 !important;
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
/* Dropdown list options */
|
| 367 |
+
.gr-dropdown ul,
|
| 368 |
+
.dropdown-options,
|
| 369 |
+
ul[class*="dropdown"] {
|
| 370 |
+
background: #ffffff !important;
|
| 371 |
+
border: 2px solid #1e40af !important;
|
| 372 |
+
border-radius: 8px !important;
|
| 373 |
+
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15) !important;
|
| 374 |
+
}
|
| 375 |
+
.gr-dropdown li,
|
| 376 |
+
.dropdown-options li,
|
| 377 |
+
ul[class*="dropdown"] li {
|
| 378 |
+
color: #1e293b !important;
|
| 379 |
+
padding: 10px 14px !important;
|
| 380 |
+
}
|
| 381 |
+
.gr-dropdown li:hover,
|
| 382 |
+
ul[class*="dropdown"] li:hover {
|
| 383 |
+
background: #c7d2fe !important;
|
| 384 |
+
color: #1e40af !important;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
/* Dropdown label */
|
| 388 |
+
.gr-dropdown label,
|
| 389 |
+
[data-testid="dropdown"] label {
|
| 390 |
+
color: #1e40af !important;
|
| 391 |
+
font-weight: 600 !important;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
/* Tabs - Clean styling */
|
| 395 |
+
.tab-nav {
|
| 396 |
+
border-bottom: 2px solid #e2e8f0 !important;
|
| 397 |
+
}
|
| 398 |
+
.tab-nav button {
|
| 399 |
+
color: #64748b !important;
|
| 400 |
+
font-weight: 500 !important;
|
| 401 |
+
padding: 12px 20px !important;
|
| 402 |
+
border: none !important;
|
| 403 |
+
background: transparent !important;
|
| 404 |
+
}
|
| 405 |
+
.tab-nav button.selected {
|
| 406 |
+
color: #0369a1 !important;
|
| 407 |
+
border-bottom: 3px solid #0369a1 !important;
|
| 408 |
+
font-weight: 600 !important;
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
/* Markdown text */
|
| 412 |
+
.prose, .markdown-text {
|
| 413 |
+
color: #334155 !important;
|
| 414 |
+
}
|
| 415 |
+
.prose h1, .prose h2, .prose h3,
|
| 416 |
+
.markdown-text h1, .markdown-text h2, .markdown-text h3 {
|
| 417 |
+
color: #1e293b !important;
|
| 418 |
+
}
|
| 419 |
+
.prose strong, .markdown-text strong {
|
| 420 |
+
color: #0f172a !important;
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
/* Scrollbar styling */
|
| 424 |
+
::-webkit-scrollbar {
|
| 425 |
+
width: 8px;
|
| 426 |
+
height: 8px;
|
| 427 |
+
}
|
| 428 |
+
::-webkit-scrollbar-track {
|
| 429 |
+
background: #f1f5f9;
|
| 430 |
+
border-radius: 4px;
|
| 431 |
+
}
|
| 432 |
+
::-webkit-scrollbar-thumb {
|
| 433 |
+
background: #94a3b8;
|
| 434 |
+
border-radius: 4px;
|
| 435 |
+
}
|
| 436 |
+
::-webkit-scrollbar-thumb:hover {
|
| 437 |
+
background: #64748b;
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
button.secondary {
|
| 441 |
+
background: #1e40af !important;
|
| 442 |
+
color: #ffffff !important;
|
| 443 |
+
border: none !important;
|
| 444 |
+
border-radius: 8px !important;
|
| 445 |
+
font-weight: 600 !important;
|
| 446 |
+
box-shadow: 0 2px 6px rgba(30, 64, 175, 0.3) !important;
|
| 447 |
+
padding: 12px 20px !important;
|
| 448 |
+
min-height: 44px !important;
|
| 449 |
+
}
|
| 450 |
+
button.secondary:hover {
|
| 451 |
+
background: #1d4ed8 !important;
|
| 452 |
+
box-shadow: 0 4px 10px rgba(30, 64, 175, 0.4) !important;
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
/* Left side input boxes with borders */
|
| 456 |
+
.left-panel-box {
|
| 457 |
+
background: #fafafa !important;
|
| 458 |
+
border: 2px solid #94a3b8 !important;
|
| 459 |
+
border-radius: 10px !important;
|
| 460 |
+
padding: 14px !important;
|
| 461 |
+
margin-bottom: 8px !important;
|
| 462 |
+
}
|
| 463 |
+
.left-panel-box:hover {
|
| 464 |
+
border-color: #64748b !important;
|
| 465 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
/* File upload box with border */
|
| 469 |
+
.file-upload-box {
|
| 470 |
+
background: #f8fafc !important;
|
| 471 |
+
border: 2px dashed #64748b !important;
|
| 472 |
+
border-radius: 10px !important;
|
| 473 |
+
padding: 14px !important;
|
| 474 |
+
}
|
| 475 |
+
.file-upload-box:hover {
|
| 476 |
+
border-color: #0369a1 !important;
|
| 477 |
+
border-style: solid !important;
|
| 478 |
+
background: #f0f9ff !important;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
/* Question input box with border */
|
| 482 |
+
.question-box {
|
| 483 |
+
background: #fffbeb !important;
|
| 484 |
+
border: 2px solid #f59e0b !important;
|
| 485 |
+
border-radius: 10px !important;
|
| 486 |
+
padding: 14px !important;
|
| 487 |
+
}
|
| 488 |
+
.question-box:hover {
|
| 489 |
+
border-color: #d97706 !important;
|
| 490 |
+
box-shadow: 0 2px 8px rgba(245, 158, 11, 0.2) !important;
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
/* Dropdown Example - Beige background on 3rd parent container */
|
| 494 |
+
.dropdownExample {
|
| 495 |
+
background: #f5f5dc !important;
|
| 496 |
+
padding: 16px !important;
|
| 497 |
+
border-radius: 8px !important;
|
| 498 |
+
border: 2px solid #d1d5db !important;
|
| 499 |
+
margin-bottom: 16px !important;
|
| 500 |
+
}
|
| 501 |
+
"""
|
| 502 |
+
js = """
|
| 503 |
+
function createGradioAnimation() {
|
| 504 |
+
var container = document.createElement('div');
|
| 505 |
+
container.id = 'gradio-animation';
|
| 506 |
+
container.style.fontSize = '2.4em';
|
| 507 |
+
container.style.fontWeight = '700';
|
| 508 |
+
container.style.textAlign = 'center';
|
| 509 |
+
container.style.marginBottom = '20px';
|
| 510 |
+
container.style.marginTop = '10px';
|
| 511 |
+
container.style.color = '#0369a1';
|
| 512 |
+
container.style.letterSpacing = '-0.02em';
|
| 513 |
+
var text = '📄 SmartDoc AI';
|
| 514 |
+
for (var i = 0; i < text.length; i++) {
|
| 515 |
+
(function(i){
|
| 516 |
+
setTimeout(function(){
|
| 517 |
+
var letter = document.createElement('span');
|
| 518 |
+
letter.style.opacity = '0';
|
| 519 |
+
letter.style.transition = 'opacity 0.2s ease';
|
| 520 |
+
letter.innerText = text[i];
|
| 521 |
+
container.appendChild(letter);
|
| 522 |
+
setTimeout(function() { letter.style.opacity = '1'; }, 50);
|
| 523 |
+
}, i * 80);
|
| 524 |
+
})(i);
|
| 525 |
+
}
|
| 526 |
+
var gradioContainer = document.querySelector('.gradio-container');
|
| 527 |
+
gradioContainer.insertBefore(container, gradioContainer.firstChild);
|
| 528 |
+
return 'Animation created';
|
| 529 |
+
}
|
| 530 |
+
(() => {
    // Rotating status messages shown while documents are being processed.
    const upload_messages = [
        "Crunching your documents...",
        "Warming up the AI...",
        "Extracting knowledge...",
        "Scanning for insights...",
        "Preparing your data...",
        "Looking for answers...",
        "Analyzing file structure...",
        "Reading your files...",
        "Indexing content...",
        "Almost ready..."
    ];

    let rotateHandle = null;    // interval rotating the status message
    let elapsedHandle = null;   // interval refreshing the elapsed-time readout
    let startedAt = null;       // Date.now() when processing started
    let previousMessage = null; // last message shown, to avoid repeats

    // Pick a random message, never repeating the previous one.
    const pickMsg = () => {
        if (upload_messages.length === 0) return "";
        if (upload_messages.length === 1) return upload_messages[0];
        let candidate = upload_messages[Math.floor(Math.random() * upload_messages.length)];
        while (candidate === previousMessage) {
            candidate = upload_messages[Math.floor(Math.random() * upload_messages.length)];
        }
        previousMessage = candidate;
        return candidate;
    };

    // Locate the message span inside the processing box (null when absent).
    const getMsgSpan = () => {
        const box = document.getElementById("processing-message");
        return box ? box.querySelector("#processing-msg") : null;
    };

    // Locate the timer span inside the processing box (null when absent).
    const getTimerSpan = () => {
        const box = document.getElementById("processing-message");
        return box ? box.querySelector("#processing-timer") : null;
    };

    // Write text into the message span, if it exists.
    const setMsg = (text) => {
        const target = getMsgSpan();
        if (target) target.textContent = text;
    };

    // Render seconds elapsed since sinceMs with one decimal place.
    const formatElapsed = (sinceMs) => `${((Date.now() - sinceMs) / 1000).toFixed(1)}s elapsed`;

    function startRotationAndTimer() {
        stopRotationAndTimer();
        setMsg(pickMsg());
        startedAt = Date.now();
        // Rotate the status message every 2 seconds.
        rotateHandle = setInterval(() => setMsg(pickMsg()), 2000);
        const readout = getTimerSpan();
        if (readout) {
            readout.textContent = formatElapsed(startedAt);
            // Refresh the elapsed-time readout 5x per second.
            elapsedHandle = setInterval(() => {
                readout.textContent = formatElapsed(startedAt);
            }, 200);
        }
    }

    function stopRotationAndTimer() {
        if (rotateHandle) {
            clearInterval(rotateHandle);
            rotateHandle = null;
        }
        if (elapsedHandle) {
            clearInterval(elapsedHandle);
            elapsedHandle = null;
        }
        const readout = getTimerSpan();
        if (readout) readout.textContent = "";
    }

    // Auto start/stop based on visibility of the processing box
    function watchProcessingBox() {
        const box = document.getElementById("processing-message");
        if (!box) {
            // The Gradio DOM may not be mounted yet; retry shortly.
            setTimeout(watchProcessingBox, 250);
            return;
        }
        const visible = () => box.offsetParent !== null;
        let wasVisible = visible();
        if (wasVisible) startRotationAndTimer();

        const observer = new MutationObserver(() => {
            const nowVisible = visible();
            if (nowVisible && !wasVisible) startRotationAndTimer();
            if (!nowVisible && wasVisible) stopRotationAndTimer();
            wasVisible = nowVisible;
        });

        observer.observe(box, { attributes: true, attributeFilter: ["style", "class"] });
    }

    // Expose manual controls for other scripts.
    window.smartdocStartRotationAndTimer = startRotationAndTimer;
    window.smartdocStopRotationAndTimer = stopRotationAndTimer;

    watchProcessingBox();
})();
|
| 636 |
+
"""
|
| 637 |
+
|
| 638 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="SmartDoc AI", css=css, js=js) as demo:
|
| 639 |
+
gr.Markdown("### SmartDoc AI - Document Q&A", elem_classes="app-title")
|
| 640 |
+
gr.Markdown("Upload your documents and ask questions. Answers will appear below, just like a chat.", elem_classes="app-description")
|
| 641 |
+
gr.Markdown("---")
|
| 642 |
+
|
| 643 |
+
# Examples dropdown
|
| 644 |
+
example_dropdown = gr.Dropdown(
|
| 645 |
+
label="Quick Start - Choose an Example",
|
| 646 |
+
choices=list(EXAMPLES.keys()),
|
| 647 |
+
value=None,
|
| 648 |
+
info="Select a pre-loaded example to try"
|
| 649 |
+
)
|
| 650 |
+
loaded_file_info = gr.Markdown("", elem_classes="info-panel", visible=False)
|
| 651 |
+
|
| 652 |
+
files = gr.Files(label="Upload your files", file_types=definitions.ALLOWED_TYPES)
|
| 653 |
+
question = gr.Textbox(label="Ask a question", lines=2, placeholder="Type your question here...")
|
| 654 |
+
chat = gr.Chatbot(label="Answers", elem_id="chat-history")
|
| 655 |
+
submit_btn = gr.Button("Get Answer", variant="primary")
|
| 656 |
+
processing_message = gr.HTML("", elem_id="processing-message", visible=False)
|
| 657 |
+
doc_context_display = gr.Markdown("*Submit a question to see which document sections were referenced*", elem_classes="doc-context", visible=False)
|
| 658 |
+
refresh_context_btn = gr.Button("Refresh Sources", variant="secondary", visible=False)
|
| 659 |
+
with gr.Tab("Context"):
|
| 660 |
+
pass # No .render() calls here; components are already defined and used in outputs
|
| 661 |
+
|
| 662 |
+
session_state = gr.State({
|
| 663 |
+
"file_hashes": frozenset(),
|
| 664 |
+
"retriever": None,
|
| 665 |
+
"chat_history": [],
|
| 666 |
+
"last_documents": [],
|
| 667 |
+
"total_questions": 0,
|
| 668 |
+
"session_start": datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 669 |
+
})
|
| 670 |
+
|
| 671 |
+
def process_question(question_text, uploaded_files, chat_history):
    """Generator handler for the "Get Answer" button.

    Each ``yield`` is an 8-tuple matching the click handler's outputs list:
    (chat, doc_context_display, refresh_context_btn, submit_btn,
     question, files, example_dropdown, processing_message).
    Intermediate yields disable the inputs and show the processing banner;
    the final yield re-enables them and hides it.
    """
    import time
    import random
    chat_history = chat_history or []
    # NOTE(review): these messages duplicate the list embedded in the page's
    # JS; the JS drives the visible rotation, so `msg`/`last_msg` below appear
    # vestigial — confirm before removing.
    upload_messages = [
        "Crunching your documents...",
        "Warming up the AI...",
        "Extracting knowledge...",
        "Scanning for insights...",
        "Preparing your data...",
        "Looking for answers...",
        "Analyzing file structure...",
        "Reading your files...",
        "Indexing content...",
        "Almost ready..."
    ]
    last_msg = None
    start_time = time.time()
    msg = random.choice([m for m in upload_messages if m != last_msg])
    last_msg = msg
    # Initial yield: lock the UI and show the processing banner.
    yield (
        chat_history,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
    )

    try:
        # Guard: empty question — report in chat and restore the UI.
        if not question_text.strip():
            chat_history.append({"role": "user", "content": question_text})
            chat_history.append({"role": "assistant", "content": "Please enter a question."})
            yield (
                chat_history,
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(value="", visible=False)
            )
            return
        # Guard: no files uploaded — report in chat and restore the UI.
        if not uploaded_files:
            chat_history.append({"role": "user", "content": question_text})
            chat_history.append({"role": "assistant", "content": "Please upload at least one document."})
            yield (
                chat_history,
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(value="", visible=False)
            )
            return
        # Stage 2: Chunking with per-chunk progress and rotating status
        all_chunks = []
        seen_hashes = set()
        total_chunks = 0
        chunk_counts = []
        # First pass: chunk each file (cache-aware) just to count chunks.
        # NOTE(review): this work is fully repeated in the second pass below;
        # the cached chunks could be kept from this pass instead — confirm.
        for file in uploaded_files:
            with open(file.name, 'rb') as f:
                file_content = f.read()
            file_hash = processor._generate_hash(file_content)
            cache_path = processor.cache_dir / f"{file_hash}.pkl"
            if processor._is_cache_valid(cache_path):
                chunks = processor._load_from_cache(cache_path)
                if not chunks:
                    # Cache entry was empty/corrupt: reprocess and refresh it.
                    chunks = processor._process_file(file)
                    processor._save_to_cache(chunks, cache_path)
            else:
                chunks = processor._process_file(file)
                processor._save_to_cache(chunks, cache_path)
            chunk_counts.append(len(chunks))
            total_chunks += len(chunks)
        if total_chunks == 0:
            # Avoid division-by-zero style issues downstream.
            total_chunks = 1
        chunk_idx = 0
        msg = random.choice(upload_messages)
        # Second pass: re-chunk each file and collect de-duplicated chunks.
        for file, file_chunk_count in zip(uploaded_files, chunk_counts):
            with open(file.name, 'rb') as f:
                file_content = f.read()
            file_hash = processor._generate_hash(file_content)
            cache_path = processor.cache_dir / f"{file_hash}.pkl"
            if processor._is_cache_valid(cache_path):
                chunks = processor._load_from_cache(cache_path)
                if not chunks:
                    chunks = processor._process_file(file)
                    processor._save_to_cache(chunks, cache_path)
            else:
                chunks = processor._process_file(file)
                processor._save_to_cache(chunks, cache_path)
            for chunk in chunks:
                # De-duplicate chunks across files by content hash.
                chunk_hash = processor._generate_hash(chunk.page_content.encode())
                if chunk_hash not in seen_hashes:
                    seen_hashes.add(chunk_hash)
                    all_chunks.append(chunk)
                # else: skip duplicate chunk
                chunk_idx += 1
                # Rotate status message every 10 seconds
                elapsed = time.time() - start_time
                if chunk_idx == 1 or (elapsed // 10) > ((elapsed-1) // 10):
                    msg = random.choice([m for m in upload_messages if m != last_msg])
                    last_msg = msg
                # When yielding progress, always do:
                yield (
                    chat_history,
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(interactive=False),
                    gr.update(interactive=False),
                    gr.update(interactive=False),
                    gr.update(interactive=False),
                    gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
                )
        # After all chunks, show 100%
        elapsed = time.time() - start_time
        yield (
            chat_history,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
        )
        # Stage 3: Building Retriever
        elapsed = time.time() - start_time
        yield (
            chat_history,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(value=(
                '<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04); display:flex; align-items:center;">'
                '<img src="https://media.giphy.com/media/26ufnwz3wDUli7GU0/giphy.gif" alt="AI working" style="height:40px; margin-right:16px;">'
                '<span id="processing-msg"></span>'
                '</div>'
            ), visible=True)
        )
        retriever = retriever_indexer.build_hybrid_retriever(all_chunks)
        # Stage 4: Generating Answer
        elapsed = time.time() - start_time
        yield (
            chat_history,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
        )
        result = orchestrator.full_pipeline(question=question_text, retriever=retriever)
        answer = result["draft_answer"]
        # Stage 5: Verifying Answer
        elapsed = time.time() - start_time
        yield (
            chat_history,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
        )
        verification = result.get("verification_report", "No verification details available.")
        logger.info(f"Verification (internal):\n{verification}")
        # Do not display verification to user, only use internally
        chat_history.append({"role": "user", "content": question_text})
        chat_history.append({"role": "assistant", "content": f"**Answer:**\n{answer}"})

        # NOTE(review): mutating gr.State.value directly may be shared across
        # concurrent sessions in some Gradio versions — confirm isolation.
        session_state.value["last_documents"] = retriever.invoke(question_text)
        # Final: Show results and make context tab visible
        total_elapsed = time.time() - start_time
        yield (
            chat_history,
            gr.update(visible=True),  # doc_context_display
            gr.update(visible=True),  # refresh_context_btn
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<span id="processing-msg"></span>
<span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
</div>''', visible=True)
        )

        # Keep the banner briefly visible, then hide it.
        time.sleep(1.5)
        yield (
            chat_history,
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(value="", visible=False)
        )
    except Exception as e:
        # Surface the failure in the chat and restore the UI.
        logger.error(f"Processing error: {e}", exc_info=True)
        chat_history.append({"role": "user", "content": question_text})
        chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
        yield (
            chat_history,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(interactive=True),
            gr.update(value="", visible=False)
        )
|
| 910 |
+
|
| 911 |
+
submit_btn.click(
|
| 912 |
+
fn=process_question,
|
| 913 |
+
inputs=[question, files, chat],
|
| 914 |
+
outputs=[chat, doc_context_display, refresh_context_btn, submit_btn, question, files, example_dropdown, processing_message],
|
| 915 |
+
queue=True,
|
| 916 |
+
show_progress=True
|
| 917 |
+
)
|
| 918 |
+
|
| 919 |
+
def refresh_context():
    """Re-render the source panel for the most recent user question."""
    # NOTE(review): reading chat.value here reflects the component's stored
    # value, which may not track the live UI state in every Gradio version —
    # confirm against the installed version.
    docs = session_state.value.get("last_documents", [])
    history = chat.value or []
    last_question = next(
        (entry["content"] for entry in reversed(history) if entry["role"] == "user"),
        "",
    )
    return format_document_context(docs, last_question)
|
| 927 |
+
|
| 928 |
+
refresh_context_btn.click(
|
| 929 |
+
fn=refresh_context,
|
| 930 |
+
inputs=[],
|
| 931 |
+
outputs=[doc_context_display]
|
| 932 |
+
)
|
| 933 |
+
|
| 934 |
+
def load_example(example_key):
    """Copy a pre-loaded example's files into a temp dir and return UI values.

    Args:
        example_key: Key into the module-level EXAMPLES mapping (may be None
            or empty when the dropdown is cleared).

    Returns:
        Tuple of (file_paths_for_upload, question_text, info_markdown).
        On an invalid key or when no file could be copied, the file list is
        empty and the markdown explains why.
    """
    if not example_key or example_key not in EXAMPLES:
        return [], "", "Select a valid example from the dropdown above"
    ex_data = EXAMPLES[example_key]
    question_text = ex_data["question"]
    file_paths = ex_data["file_paths"]
    import tempfile
    # Copy into a fresh temp dir so Gradio can serve the files safely.
    temp_dir = tempfile.mkdtemp()
    copied_files = []
    file_info_text = f"Loaded: {example_key}\n\n"
    for source_file_path in file_paths:
        abs_source = os.path.abspath(source_file_path)
        if os.path.exists(abs_source):
            filename = os.path.basename(abs_source)
            temp_file_path = os.path.join(temp_dir, filename)
            shutil.copy2(abs_source, temp_file_path)
            copied_files.append(temp_file_path)
            file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
            # Fix: report the actual file name (previously a literal
            # placeholder was emitted and `filename` went unused here).
            file_info_text += f"{filename} ({file_size_mb:.2f} MB)\n"
        else:
            file_info_text += f"{source_file_path} not found\n"
    if not copied_files:
        return [], "", "Could not load example files"
    return copied_files, question_text, file_info_text
|
| 958 |
+
|
| 959 |
+
# Remove the Load Example button and related logic
|
| 960 |
+
# Instead, load the example immediately when dropdown changes
|
| 961 |
+
example_dropdown.change(
|
| 962 |
+
fn=load_example,
|
| 963 |
+
inputs=[example_dropdown],
|
| 964 |
+
outputs=[files, question, loaded_file_info]
|
| 965 |
+
)
|
| 966 |
+
# Launch server - Compatible with both local and Hugging Face Spaces
|
| 967 |
+
# HF Spaces sets SPACE_ID environment variable
|
| 968 |
+
is_hf_space = os.environ.get("SPACE_ID") is not None
|
| 969 |
+
|
| 970 |
+
if is_hf_space:
|
| 971 |
+
# Hugging Face Spaces configuration
|
| 972 |
+
logger.info("Running on Hugging Face Spaces")
|
| 973 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
| 974 |
+
else:
|
| 975 |
+
# Local development configuration
|
| 976 |
+
configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
|
| 977 |
+
server_port = _find_open_port(configured_port)
|
| 978 |
+
|
| 979 |
+
logger.info(f"Launching Gradio on port {server_port}")
|
| 980 |
+
logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
|
| 981 |
+
|
| 982 |
+
demo.launch(server_name="127.0.0.1", server_port=server_port, share=False)
|
| 983 |
+
|
| 984 |
+
|
| 985 |
+
if __name__ == "__main__":
|
| 986 |
+
main()
|
maintenance.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import glob
|
| 4 |
+
|
| 5 |
+
def clean_pycache():
    """
    Recursively finds and removes all __pycache__ directories and .pyc files
    within the current working directory.
    """
    print("Starting Python cache cleanup...")

    # Remove __pycache__ directories first.
    cache_dirs = glob.glob('**/__pycache__', recursive=True)
    if not cache_dirs:
        print("No __pycache__ directories found.")
    else:
        for target in cache_dirs:
            if not os.path.isdir(target):
                continue
            try:
                shutil.rmtree(target)
            except OSError as e:
                print(f"Error removing directory {target}: {e}")
            else:
                print(f"Removed directory: {target}")

    # Then sweep any stray .pyc files (less common).
    stray_pycs = glob.glob('**/*.pyc', recursive=True)
    if not stray_pycs:
        print("No .pyc files found.")
    else:
        for target in stray_pycs:
            if not os.path.isfile(target):
                continue
            try:
                os.remove(target)
            except OSError as e:
                print(f"Error removing file {target}: {e}")
            else:
                print(f"Removed file: {target}")

    print("Cleanup complete.")
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
|
| 41 |
+
clean_pycache()
|
requirements.txt
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
aiofiles>=23.2.1
|
| 3 |
+
aiohttp>=3.11.0
|
| 4 |
+
annotated-types>=0.7.0
|
| 5 |
+
anyio>=4.8.0
|
| 6 |
+
|
| 7 |
+
# PDF Processing
|
| 8 |
+
pdfplumber>=0.11.0
|
| 9 |
+
pdf2image>=1.17.0
|
| 10 |
+
Pillow>=10.0.0
|
| 11 |
+
|
| 12 |
+
# Computer Vision for local chart detection (cost optimization)
|
| 13 |
+
opencv-python>=4.8.0
|
| 14 |
+
|
| 15 |
+
# LangChain ecosystem
|
| 16 |
+
langchain>=0.3.16
|
| 17 |
+
langchain-core>=0.3.32
|
| 18 |
+
langchain-text-splitters>=0.3.5
|
| 19 |
+
langchain-google-genai>=2.0.0
|
| 20 |
+
langchain-community>=0.3.16
|
| 21 |
+
langchain-chroma>=0.1.0
|
| 22 |
+
|
| 23 |
+
# Google AI for chart analysis
|
| 24 |
+
google-generativeai>=0.8.0
|
| 25 |
+
|
| 26 |
+
# Vector store
|
| 27 |
+
chromadb>=0.6.3
|
| 28 |
+
|
| 29 |
+
# Web framework
|
| 30 |
+
gradio>=5.13.0
|
| 31 |
+
|
| 32 |
+
# Data processing
|
| 33 |
+
pandas>=2.1.4
|
| 34 |
+
numpy>=1.26.4
|
| 35 |
+
beautifulsoup4>=4.12.3
|
| 36 |
+
|
| 37 |
+
# Document loaders
|
| 38 |
+
python-docx>=1.1.2
|
| 39 |
+
docx2txt>=0.8
|
| 40 |
+
|
| 41 |
+
# Configuration
|
| 42 |
+
pydantic>=2.11.10,<2.12.5
|
| 43 |
+
pydantic-settings>=2.10.1,<3.0.0
|
| 44 |
+
python-dotenv>=1.0.1
|
| 45 |
+
|
| 46 |
+
# BM25 retriever
|
| 47 |
+
rank-bm25>=0.2.2
|
| 48 |
+
|
| 49 |
+
# Utilities
|
| 50 |
+
tqdm>=4.67.0
|
| 51 |
+
requests>=2.32.0
|
| 52 |
+
tiktoken>=0.8.0
|
| 53 |
+
tenacity>=9.0.0
|
search_engine/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .indexer import RetrieverBuilder
|
| 2 |
+
|
| 3 |
+
__all__ = ["RetrieverBuilder"]
|
search_engine/indexer.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Retriever indexer module for DocChat.
|
| 3 |
+
|
| 4 |
+
Provides utilities for building different types of retrievers:
|
| 5 |
+
- Vector-based retriever (ChromaDB + embeddings)
|
| 6 |
+
- Hybrid retriever (BM25 + Vector with ensemble)
|
| 7 |
+
"""
|
| 8 |
+
import logging
|
| 9 |
+
import sys
|
| 10 |
+
from typing import List, Any
|
| 11 |
+
import time
|
| 12 |
+
import hashlib
|
| 13 |
+
import os
|
| 14 |
+
import json
|
| 15 |
+
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
from langchain_core.retrievers import BaseRetriever
|
| 18 |
+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
| 19 |
+
from langchain_chroma import Chroma
|
| 20 |
+
from langchain_community.retrievers import BM25Retriever
|
| 21 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 22 |
+
from langchain_core.vectorstores import VectorStoreRetriever
|
| 23 |
+
from configuration.parameters import parameters
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def doc_id(doc) -> str:
    """Stable chunk identifier: SHA-256 over source, page and chunk_id."""
    meta = doc.metadata
    key = "::".join(
        f"{meta.get(field, '')}" for field in ("source", "page", "chunk_id")
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def content_hash(doc) -> str:
    """SHA-256 hex digest of the chunk's text, used for change detection."""
    digest = hashlib.sha256()
    digest.update(doc.page_content.encode("utf-8"))
    return digest.hexdigest()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def load_manifest(path):
    """Load the on-disk index manifest, or an empty dict when absent."""
    if not os.path.exists(path):
        return {}
    with open(path, "r") as fh:
        return json.load(fh)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def save_manifest(path, manifest):
    """Persist the index manifest as JSON at *path*, overwriting any file."""
    with open(path, "w") as fh:
        json.dump(manifest, fh)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class EnsembleRetriever(BaseRetriever):
    """
    Custom Ensemble Retriever combining multiple retrievers with weighted RRF.

    Attributes:
        retrievers: List of retriever instances
        weights: List of weights (should sum to 1.0)
        c: RRF constant (default: 60)
        k: Max documents to return (default: 10)
    """

    retrievers: List[Any]
    weights: List[float]
    c: int = 60
    k: int = 10

    class Config:
        # Retrievers are arbitrary (non-pydantic) objects; allow them as fields.
        arbitrary_types_allowed = True

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None
    ) -> List[Document]:
        """Retrieve and combine documents using weighted RRF, deduplicating charts by content and aggregating page numbers."""
        logger.debug(f"[ENSEMBLE] Query: {query[:80]}...")
        # Maps (page_content, source) -> (Document, accumulated RRF score).
        all_docs_with_scores = {}
        # Labels for logging; positions past this list fall back to "Retriever_<idx>".
        retriever_names = ["BM25", "Vector"]
        for idx, (retriever, weight) in enumerate(zip(self.retrievers, self.weights)):
            retriever_name = retriever_names[idx] if idx < len(retriever_names) else f"Retriever_{idx}"
            try:
                docs = retriever.invoke(query)
                logger.debug(f"[ENSEMBLE] {retriever_name}: {len(docs)} docs (weight: {weight})")
                for rank, doc in enumerate(docs):
                    # Deduplicate by content and source only
                    doc_key = (doc.page_content, doc.metadata.get('source', ''))
                    # Weighted reciprocal-rank-fusion contribution for this hit.
                    rrf_score = weight / (rank + 1 + self.c)
                    if doc_key in all_docs_with_scores:
                        existing_doc, existing_score = all_docs_with_scores[doc_key]
                        # Aggregate page numbers
                        # NOTE(review): this mutates the retained Document's
                        # metadata in place; if the underlying retriever caches
                        # Document objects, 'page' accumulates across calls —
                        # confirm.
                        existing_pages = set()
                        if isinstance(existing_doc.metadata.get('page'), list):
                            existing_pages.update(existing_doc.metadata['page'])
                        else:
                            existing_pages.add(existing_doc.metadata.get('page'))
                        existing_pages.add(doc.metadata.get('page'))
                        # Update metadata to include all pages
                        existing_doc.metadata['page'] = sorted(p for p in existing_pages if p is not None)
                        all_docs_with_scores[doc_key] = (existing_doc, existing_score + rrf_score)
                    else:
                        all_docs_with_scores[doc_key] = (doc, rrf_score)
            except Exception as e:
                # A failing sub-retriever degrades the ensemble but does not
                # abort the whole query.
                logger.warning(f"[ENSEMBLE] {retriever_name} failed: {e}")
                continue
        # Highest fused score first; truncate to the configured k.
        sorted_docs = sorted(all_docs_with_scores.values(), key=lambda x: x[1], reverse=True)
        result = [doc for doc, score in sorted_docs[:self.k]]
        logger.debug(f"[ENSEMBLE] Returning {len(result)} documents")
        return result
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class RetrieverBuilder:
|
| 114 |
+
"""Builder class for creating document retrievers with caching."""
|
| 115 |
+
|
| 116 |
+
    def __init__(self):
        """Initialize with embeddings model.

        Creates the Google Generative AI embeddings client used by the
        vector store, plus an in-memory cache of built retrievers.
        """
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=parameters.GOOGLE_API_KEY,
            batch_size=32,  # Enable batching for faster embedding computation
        )
        # Cache of built retrievers keyed by document-set hash.
        self._retriever_cache = {}  # {docset_hash: retriever}
|
| 124 |
+
|
| 125 |
+
def _hash_docs(self, docs):
|
| 126 |
+
# Create a hash of all document contents and metadata
|
| 127 |
+
m = hashlib.sha256()
|
| 128 |
+
for doc in docs:
|
| 129 |
+
m.update(doc.page_content.encode('utf-8'))
|
| 130 |
+
for k, v in sorted(doc.metadata.items()):
|
| 131 |
+
m.update(str(k).encode('utf-8'))
|
| 132 |
+
m.update(str(v).encode('utf-8'))
|
| 133 |
+
return m.hexdigest()
|
| 134 |
+
|
| 135 |
+
def build_hybrid_retriever(self, docs) -> EnsembleRetriever:
    """
    Build a hybrid retriever combining BM25 keyword search and Chroma vector search.

    Incrementally indexes ``docs`` into the persistent Chroma store: a JSON
    manifest (doc id -> content hash) records what has already been embedded,
    so only new or changed documents are re-embedded on subsequent calls.

    Args:
        docs: Non-empty list of langchain Documents to index.

    Returns:
        EnsembleRetriever combining BM25 and vector (MMR) search.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    logger.info(f"Building hybrid retriever with {len(docs)} documents...")
    if not docs:
        raise ValueError("No documents provided")

    # Persistent vector store plus the on-disk manifest of indexed content.
    chroma_dir = parameters.CHROMA_DB_PATH
    manifest_path = os.path.join(chroma_dir, "indexed_manifest.json")
    os.makedirs(chroma_dir, exist_ok=True)
    manifest = load_manifest(manifest_path)
    vector_store = Chroma(
        embedding_function=self.embeddings,
        persist_directory=chroma_dir,
    )

    # Diff incoming docs against the manifest so unchanged docs are skipped.
    to_add = []
    ids_to_add = []
    # NOTE(review): to_delete_ids is collected but never used — changed docs are
    # re-added without deleting the stale embedding first; confirm Chroma
    # upserts by id, otherwise stale vectors accumulate.
    to_delete_ids = []
    # NOTE(review): current_ids is unused; presumably intended for pruning
    # docs that were removed from the corpus — verify before deleting.
    current_ids = set()
    for d in docs:
        _id = doc_id(d)
        _hash = content_hash(d)
        current_ids.add(_id)
        if _id not in manifest:
            # Brand-new document.
            to_add.append(d)
            ids_to_add.append(_id)
            manifest[_id] = _hash
        elif manifest[_id] != _hash:
            # Content changed since the last indexing run.
            to_delete_ids.append(_id)
            to_add.append(d)
            ids_to_add.append(_id)
            manifest[_id] = _hash

    if to_add:
        # Safety net: de-dupe before add_documents (duplicate ids would make
        # the Chroma insert fail or double-count chunks).
        seen = set()
        uniq_docs, uniq_ids = [], []
        for doc, _id in zip(to_add, ids_to_add):
            if _id in seen:
                continue
            seen.add(_id)
            uniq_docs.append(doc)
            uniq_ids.append(_id)

        # Diagnostics: report duplicate ids and example sources through the
        # module logger (was print(), which bypassed the app's logging config).
        from collections import Counter
        counts = Counter(ids_to_add)
        dupes = [i for i, c in counts.items() if c > 1]
        if dupes:
            logger.debug("Duplicate IDs: %d", len(dupes))
            for dupe_id in dupes[:10]:
                idxs = [k for k, x in enumerate(ids_to_add) if x == dupe_id]
                logger.debug("ID: %s examples:", dupe_id)
                for k in idxs[:3]:
                    md = to_add[k].metadata
                    logger.debug("   %s %s %s", md.get("source"), md.get("page"), md.get("chunk_index"))

        vector_store.add_documents(uniq_docs, ids=uniq_ids)
        save_manifest(manifest_path, manifest)

    # BM25 (keyword) retriever over the full, current doc set.
    t_bm25_start = time.time()
    texts = [doc.page_content for doc in docs]
    metadatas = [doc.metadata for doc in docs]
    bm25_retriever = BM25Retriever.from_texts(texts=texts, metadatas=metadatas)
    bm25_retriever.k = parameters.BM25_SEARCH_K
    t_bm25_end = time.time()
    logger.info(f"[PROFILE] BM25 retriever creation: {t_bm25_end - t_bm25_start:.2f}s")
    logger.debug(f"BM25 indexed {len(texts)} texts, k={bm25_retriever.k}")

    # Vector retriever with MMR for diversity among the top results.
    t_vec_retr_start = time.time()
    vector_retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": parameters.VECTOR_Search_K_CHROMA,
            "fetch_k": parameters.VECTOR_FETCH_K,
            "lambda_mult": 0.7,  # 0..1 trade-off between relevance and diversity
        },
    )
    t_vec_retr_end = time.time()
    logger.info(f"[PROFILE] Vector retriever creation: {t_vec_retr_end - t_vec_retr_start:.2f}s")
    logger.debug("Vector retriever created")

    # Weighted ensemble of both retrievers.
    t_ensemble_start = time.time()
    hybrid_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=parameters.HYBRID_RETRIEVER_WEIGHTS,
        k=parameters.VECTOR_SEARCH_K,
    )
    t_ensemble_end = time.time()
    logger.info(f"[PROFILE] Ensemble retriever creation: {t_ensemble_end - t_ensemble_start:.2f}s")
    logger.info(f"Hybrid retriever created (k={parameters.VECTOR_SEARCH_K})")
    logger.info(f"[PROFILE] Total hybrid retriever build: {t_ensemble_end - t_bm25_start:.2f}s")
    return hybrid_retriever
|
test_token_size.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
"""Test script to analyze token size of retrieved documents.

Loads the example environmental report, builds the hybrid retriever, runs one
query, and prints character/token estimates so quota usage can be compared
against the Gemini free-tier daily limit.
"""

import sys
sys.path.insert(0, r'd:\MultiRAGgent\docchat')

from content_analyzer.document_parser import DocumentProcessor
from search_engine.indexer import RetrieverBuilder
from pathlib import Path

# Initialize
processor = DocumentProcessor()
retriever_indexer = RetrieverBuilder()

# Load the example document
example_file = Path(r'd:\MultiRAGgent\docchat\examples\google-2024-environmental-report.pdf')

print(f"\n{'='*80}")
# BUG FIX: this line was missing the f-prefix, so it printed the literal
# text "{example_file.name}" instead of the actual file name.
print(f"[TOKEN_ANALYSIS] Loading document: {example_file.name}")
print(f"{'='*80}\n")

# Process document
chunks = processor.process([str(example_file)])
print(f"[TOKEN_ANALYSIS] ✓ Loaded {len(chunks)} chunks from document")

# Build retriever
print(f"\n[TOKEN_ANALYSIS] Building hybrid retriever...")
retriever = retriever_indexer.build_retriever_with_scores(chunks)
print(f"[TOKEN_ANALYSIS] ✓ Retriever built\n")

# Test retrieval
question = "Retrieve the data center PUE efficiency values in Singapore 2nd facility in 2019 and 2022"
print(f"[TOKEN_ANALYSIS] Question: {question}\n")

retrieved_docs = retriever.invoke(question)

# Calculate token metrics
print(f"\n{'='*80}")
print(f"[TOKEN_ANALYSIS] RETRIEVAL RESULTS")
print(f"{'='*80}\n")

print(f"[TOKEN_ANALYSIS] Retrieved {len(retrieved_docs)} documents")

# Character and token analysis — rough chars-per-token heuristics per vendor.
total_chars = sum(len(doc.page_content) for doc in retrieved_docs)
tokens_gpt = total_chars / 4     # ~4 chars per token (GPT/Claude)
tokens_gemini = total_chars / 3  # ~3 chars per token (Gemini - more aggressive)
# (removed unused tokens_claude estimate — it was never printed)

if retrieved_docs:
    avg_chars = total_chars // len(retrieved_docs)
    avg_tokens_gemini = avg_chars // 3

    print(f"\n[CHARACTER COUNT]")
    print(f" Total characters: {total_chars:,}")
    print(f" Average per doc: {avg_chars:,} chars")

    print(f"\n[TOKEN COUNT ESTIMATES]")
    print(f" Gemini (1 token ≈ 3 chars): {tokens_gemini:,.0f} tokens")
    print(f" GPT/Claude (1 token ≈ 4 chars): {tokens_gpt:,.0f} tokens")
    print(f" Average per doc (Gemini): {avg_tokens_gemini:,} tokens")

    print(f"\n[QUOTA ANALYSIS]")
    print(f" Gemini free tier limit: 250,000 tokens/day")
    print(f" Your 64 docs use: {tokens_gemini:,.0f} tokens")
    percentage = (tokens_gemini / 250000) * 100
    print(f" Percentage of daily quota: {percentage:.1f}%")

    print(f"\n[DOCUMENT SIZE BREAKDOWN]")
    for i, doc in enumerate(retrieved_docs[:5], 1):
        chars = len(doc.page_content)
        tokens = chars // 3
        print(f" Doc {i}: {chars:,} chars (~{tokens:,} tokens)")
    if len(retrieved_docs) > 5:
        print(f" ... and {len(retrieved_docs) - 5} more documents")

print(f"\n{'='*80}\n")
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test fixtures and shared utilities for DocChat tests.
|
| 3 |
+
"""
|
| 4 |
+
import pytest
|
| 5 |
+
from unittest.mock import MagicMock
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FakeLLM:
    """In-memory stand-in for a chat LLM; records calls, never hits an API."""

    def __init__(self, content: str = "Test response"):
        # Canned reply plus simple call-tracking state for assertions.
        self.content = content
        self.invoke_count = 0
        self.last_prompt = None

    def invoke(self, prompt: str):
        # Record the call, then hand back an ad-hoc object exposing .content,
        # mimicking a langchain chat-model response.
        self.invoke_count += 1
        self.last_prompt = prompt
        response_cls = type("Response", (), {"content": self.content})
        return response_cls()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class FakeRetriever:
    """In-memory stand-in for a retriever; records queries, no vector store."""

    def __init__(self, documents: list = None):
        # Falsy input (None or an empty list) normalises to a fresh [].
        self.documents = documents if documents else []
        self.last_query = None
        self.invoke_count = 0

    def invoke(self, query: str):
        # Track the call and echo back the configured document list.
        self.invoke_count += 1
        self.last_query = query
        return self.documents
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.fixture
def sample_documents():
    """Three canned Documents: two sustainability facts and one benchmark fact."""
    corpus = [
        ("The data center in Singapore achieved a PUE of 1.12 in 2022.", "test.pdf", 1),
        ("Carbon-free energy in Asia Pacific reached 45% in 2023.", "test.pdf", 2),
        ("DeepSeek-R1 outperformed o1-mini on coding benchmarks.", "deepseek.pdf", 1),
    ]
    return [
        Document(page_content=text, metadata={"source": src, "page": page})
        for text, src, page in corpus
    ]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@pytest.fixture
def fake_llm():
    """Provide a FakeLLM preloaded with a canned reply."""
    stub = FakeLLM("This is a test response.")
    return stub
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@pytest.fixture
def fake_retriever(sample_documents):
    """Provide a FakeRetriever that always returns the sample documents."""
    stub = FakeRetriever(sample_documents)
    return stub
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@pytest.fixture
def empty_retriever():
    """Provide a FakeRetriever that yields no documents at all."""
    stub = FakeRetriever([])
    return stub
|
tests/test_accuracy_verifier.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the VerificationAgent.
|
| 3 |
+
"""
|
| 4 |
+
import pytest
|
| 5 |
+
from unittest.mock import MagicMock, patch
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
# Import after setting up mocks to avoid API key validation
|
| 9 |
+
import sys
|
| 10 |
+
sys.path.insert(0, '.')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TestVerificationAgent:
    """Test suite for VerificationAgent.

    Relies on the shared ``fake_llm`` / ``sample_documents`` fixtures from
    conftest.py, so no real Gemini API calls are made.
    """

    @pytest.fixture
    def mock_parameters(self, monkeypatch):
        """Mock parameters to avoid API key requirement."""
        # Any non-empty value works; the FakeLLM never contacts the real API.
        monkeypatch.setenv("GOOGLE_API_KEY", "test_key_for_testing")

    @pytest.fixture
    def accuracy_verifier(self, mock_parameters, fake_llm):
        """Create a VerificationAgent with mocked LLM."""
        # Imported lazily so the env var patched above is in place first.
        from intelligence.accuracy_verifier import VerificationAgent
        return VerificationAgent(llm=fake_llm)

    def test_check_with_supported_answer(self, accuracy_verifier, sample_documents):
        """Test verification with an answer supported by documents."""
        # Configure the fake LLM to return a supported response
        accuracy_verifier.llm.content = """
        Supported: YES
        Unsupported Claims: []
        Contradictions: []
        Relevant: YES
        Additional Details: The answer is well-supported by the context.
        """

        result = accuracy_verifier.check(
            answer="The PUE in Singapore was 1.12 in 2022.",
            documents=sample_documents
        )

        # check() returns a dict carrying the raw report and the context used.
        assert "verification_report" in result
        assert "Supported: YES" in result["verification_report"]
        assert "context_used" in result

    def test_check_with_unsupported_answer(self, accuracy_verifier, sample_documents):
        """Test verification with an unsupported answer."""
        accuracy_verifier.llm.content = """
        Supported: NO
        Unsupported Claims: [The PUE was 1.5]
        Contradictions: []
        Relevant: YES
        Additional Details: The claimed PUE value is not in the context.
        """

        result = accuracy_verifier.check(
            answer="The PUE in Singapore was 1.5 in 2022.",
            documents=sample_documents
        )

        assert "Supported: NO" in result["verification_report"]

    def test_parse_verification_response_valid(self, accuracy_verifier):
        """Test parsing a valid verification response."""
        response = """
        Supported: YES
        Unsupported Claims: []
        Contradictions: []
        Relevant: YES
        Additional Details: All claims verified.
        """

        parsed = accuracy_verifier.parse_verification_response(response)

        # "[]" must parse to an empty Python list, not the literal string.
        assert parsed["Supported"] == "YES"
        assert parsed["Relevant"] == "YES"
        assert parsed["Unsupported Claims"] == []

    def test_parse_verification_response_with_claims(self, accuracy_verifier):
        """Test parsing response with unsupported claims."""
        response = """
        Supported: NO
        Unsupported Claims: [claim1, claim2]
        Contradictions: [contradiction1]
        Relevant: YES
        Additional Details: Multiple issues found.
        """

        parsed = accuracy_verifier.parse_verification_response(response)

        # Bracketed, comma-separated items must split into individual claims.
        assert parsed["Supported"] == "NO"
        assert len(parsed["Unsupported Claims"]) == 2
        assert len(parsed["Contradictions"]) == 1

    def test_format_verification_report(self, accuracy_verifier):
        """Test formatting a verification report."""
        verification = {
            "Supported": "YES",
            "Unsupported Claims": [],
            "Contradictions": [],
            "Relevant": "YES",
            "Additional Details": "Well verified."
        }

        report = accuracy_verifier.format_verification_report(verification)

        # Empty claim lists should render as the literal "None" in markdown.
        assert "**Supported:** YES" in report
        assert "**Relevant:** YES" in report
        assert "**Unsupported Claims:** None" in report
tests/test_context_validator.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the RelevanceChecker.
|
| 3 |
+
"""
|
| 4 |
+
import pytest
|
| 5 |
+
from unittest.mock import MagicMock
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
sys.path.insert(0, '.')
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TestRelevanceChecker:
    """Test suite for RelevanceChecker.

    The checker classifies whether retrieved context can answer a question,
    returning one of the labels CAN_ANSWER / PARTIAL / NO_MATCH.
    """

    @pytest.fixture
    def mock_parameters(self, monkeypatch):
        """Mock parameters to avoid API key requirement."""
        # Any non-empty value works; the FakeLLM never contacts the real API.
        monkeypatch.setenv("GOOGLE_API_KEY", "test_key_for_testing")

    @pytest.fixture
    def context_validator(self, mock_parameters, fake_llm):
        """Create a RelevanceChecker with mocked LLM."""
        from intelligence.context_validator import RelevanceChecker
        checker = RelevanceChecker()
        # Swap the real LLM for the fake after construction.
        checker.llm = fake_llm
        return checker

    def test_check_can_answer(self, context_validator, fake_retriever):
        """Test when documents can fully answer the question."""
        context_validator.llm.content = "CAN_ANSWER"

        result = context_validator.check(
            question="What is the PUE in Singapore?",
            retriever=fake_retriever,
            k=3
        )

        assert result == "CAN_ANSWER"
        # The retriever must have been consulted exactly once.
        assert fake_retriever.invoke_count == 1

    def test_check_partial_match(self, context_validator, fake_retriever):
        """Test when documents partially match the question."""
        context_validator.llm.content = "PARTIAL"

        result = context_validator.check(
            question="What is the historical trend of PUE?",
            retriever=fake_retriever,
            k=3
        )

        assert result == "PARTIAL"

    def test_check_no_match(self, context_validator, fake_retriever):
        """Test when documents don't match the question."""
        context_validator.llm.content = "NO_MATCH"

        result = context_validator.check(
            question="What is the weather in Paris?",
            retriever=fake_retriever,
            k=3
        )

        assert result == "NO_MATCH"

    def test_check_empty_question(self, context_validator, fake_retriever):
        """Test with empty question returns NO_MATCH."""
        # No llm.content set on purpose: the guard should trip before any LLM call.
        result = context_validator.check(
            question="",
            retriever=fake_retriever,
            k=3
        )

        assert result == "NO_MATCH"

    def test_check_empty_retriever_results(self, context_validator, empty_retriever):
        """Test when retriever returns no documents."""
        result = context_validator.check(
            question="Any question",
            retriever=empty_retriever,
            k=3
        )

        assert result == "NO_MATCH"

    def test_check_invalid_llm_response(self, context_validator, fake_retriever):
        """Test when LLM returns invalid response."""
        # Anything outside the known label set should be coerced to NO_MATCH.
        context_validator.llm.content = "INVALID_LABEL"

        result = context_validator.check(
            question="What is the PUE?",
            retriever=fake_retriever,
            k=3
        )

        assert result == "NO_MATCH"

    def test_check_retriever_exception(self, context_validator):
        """Test when retriever throws an exception."""
        # A failing retriever must degrade to NO_MATCH, not propagate.
        failing_retriever = MagicMock()
        failing_retriever.invoke.side_effect = Exception("Connection error")

        result = context_validator.check(
            question="Any question",
            retriever=failing_retriever,
            k=3
        )

        assert result == "NO_MATCH"

    def test_check_invalid_k_value(self, context_validator, fake_retriever):
        """Test with invalid k value defaults to 3."""
        context_validator.llm.content = "CAN_ANSWER"

        result = context_validator.check(
            question="What is the PUE?",
            retriever=fake_retriever,
            k=-1
        )

        assert result == "CAN_ANSWER"
tests/test_knowledge_synthesizer.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from intelligence.knowledge_synthesizer import ResearchAgent
|
| 6 |
+
LANGCHAIN_AVAILABLE = True
|
| 7 |
+
except ImportError:
|
| 8 |
+
Document = None # type: ignore
|
| 9 |
+
ResearchAgent = None # type: ignore
|
| 10 |
+
LANGCHAIN_AVAILABLE = False
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class FakeLLM:
    """Simple stand-in for ChatGoogleGenerativeAI to avoid network calls.

    Records the last prompt and returns a response-like object exposing
    ``.content``, matching the shape langchain chat models return.
    """

    def __init__(self, content: str) -> None:
        # Canned reply text; last_prompt captures what invoke() received.
        self.content = content
        self.last_prompt = None

    def invoke(self, prompt: str):
        self.last_prompt = prompt
        # BUG FIX: the original returned the dynamically built *class* itself
        # (missing the trailing "()"), not an instance. Attribute access still
        # worked by accident because "content" was a class attribute, but this
        # was inconsistent with conftest.FakeLLM and broke instance-based use.
        return type("Resp", (), {"content": self.content})()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@unittest.skipUnless(LANGCHAIN_AVAILABLE, "langchain not installed in this environment")
class ResearchAgentTests(unittest.TestCase):
    # Exercises ResearchAgent.generate() using the FakeLLM defined above,
    # so no network access is required.

    def test_generate_returns_stubbed_content_with_citations(self):
        """generate() returns the LLM text and feeds doc content into the prompt."""
        docs = [
            Document(page_content="Alpha text", metadata={"id": "a1"}),
            Document(page_content="Beta text", metadata={"source": "s1"}),
        ]
        llm = FakeLLM("Answer about alpha")
        agent = ResearchAgent(llm=llm, top_k=1, max_context_chars=200)

        result = agent.generate("What is alpha?", docs)

        self.assertEqual(result["draft_answer"], "Answer about alpha")
        # top_k=1 presumably keeps only the first document, so its text should
        # appear in the prompt that reached the LLM — confirm against ResearchAgent.
        self.assertIn("Alpha text", llm.last_prompt)

    def test_generate_handles_no_documents(self):
        """generate() degrades gracefully when retrieval found nothing."""
        llm = FakeLLM("unused")
        agent = ResearchAgent(llm=llm)

        result = agent.generate("Any question", [])

        self.assertIn("could not find supporting documents", result["draft_answer"])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Allow running this test module directly (python tests/test_knowledge_synthesizer.py).
if __name__ == "__main__":
    unittest.main()
|
tests/test_visual_extraction.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test script for Gemini Vision chart extraction.
|
| 3 |
+
|
| 4 |
+
This script demonstrates how to use the chart extraction feature
|
| 5 |
+
and validates that it's working correctly.
|
| 6 |
+
"""
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Add parent directory to path
|
| 13 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 14 |
+
|
| 15 |
+
from content_analyzer.document_parser import DocumentProcessor
|
| 16 |
+
from configuration.parameters import parameters
|
| 17 |
+
|
| 18 |
+
# Configure logging
|
| 19 |
+
logging.basicConfig(
|
| 20 |
+
level=logging.INFO,
|
| 21 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 22 |
+
)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_chart_extraction():
    """Test chart extraction on a sample PDF with charts.

    Walks the full pipeline manually: config check -> processor init ->
    process a sample PDF -> count/inspect chart vs. text chunks. Purely a
    diagnostic script; it logs findings rather than asserting.

    NOTE(review): the "?"/"??" glyphs in log strings look like mojibake'd
    emoji from a non-UTF-8 save — confirm and restore the intended characters.
    """

    logger.info("=" * 60)
    logger.info("Testing Gemini Vision Chart Extraction")
    logger.info("=" * 60)

    # Check if chart extraction is enabled (feature-flagged via .env).
    if not parameters.ENABLE_CHART_EXTRACTION:
        logger.warning("?? Chart extraction is DISABLED")
        logger.info("Enable it by setting ENABLE_CHART_EXTRACTION=true in .env")
        return

    logger.info(f"? Chart extraction enabled")
    logger.info(f"?? Using model: {parameters.CHART_VISION_MODEL}")
    logger.info(f"?? Max tokens: {parameters.CHART_MAX_TOKENS}")

    # Initialize processor; bail out early if the vision client is unavailable.
    try:
        processor = DocumentProcessor()
        logger.info("? DocumentProcessor initialized")

        if processor.gemini_client:
            logger.info("? Gemini Vision client ready")
        else:
            logger.error("? Gemini Vision client not initialized")
            return

    except Exception as e:
        logger.error(f"? Failed to initialize processor: {e}")
        return

    # Test with example PDF (if exists) — first match wins.
    test_files = [
        "examples/google-2024-environmental-report.pdf",
        "examples/deppseek.pdf",
        "test/sample_with_charts.pdf"
    ]

    found_file = None
    for test_file in test_files:
        if os.path.exists(test_file):
            found_file = test_file
            break

    if not found_file:
        logger.warning("?? No test PDF files found")
        logger.info("Available test files:")
        for tf in test_files:
            logger.info(f" - {tf}")
        logger.info("\nTo test manually:")
        logger.info("1. Place a PDF with charts in one of the above locations")
        logger.info("2. Run this script again")
        return

    logger.info(f"\n?? Processing test file: {found_file}")

    # Create mock file object mimicking the upload object the processor
    # normally receives (only .name and .size appear to be read — confirm).
    class MockFile:
        def __init__(self, path):
            self.name = path
            self.size = os.path.getsize(path)

    try:
        # Process the file through the full extraction pipeline.
        mock_file = MockFile(found_file)
        chunks = processor.process([mock_file])

        logger.info(f"\n? Processing complete!")
        logger.info(f"?? Total chunks extracted: {len(chunks)}")

        # Count chart chunks — chart-derived chunks carry metadata type == "chart".
        chart_chunks = [c for c in chunks if c.metadata.get("type") == "chart"]
        text_chunks = [c for c in chunks if c.metadata.get("type") != "chart"]

        logger.info(f"?? Chart chunks: {len(chart_chunks)}")
        logger.info(f"?? Text chunks: {len(text_chunks)}")

        # Display chart analyses (first 200 chars of each as a preview).
        if chart_chunks:
            logger.info(f"\n{'=' * 60}")
            logger.info("?? CHART ANALYSES EXTRACTED:")
            logger.info('=' * 60)

            for i, chunk in enumerate(chart_chunks, 1):
                logger.info(f"\n--- Chart {i} ---")
                logger.info(f"Page: {chunk.metadata.get('page')}")
                logger.info(f"Preview: {chunk.page_content[:200]}...")
                logger.info("")
        else:
            logger.info("\n?? No charts detected in this document")
            logger.info("This could mean:")
            logger.info(" - Document contains no charts")
            logger.info(" - Charts are embedded as tables (already extracted)")
            logger.info(" - Charts are too complex for detection")

        logger.info(f"\n{'=' * 60}")
        logger.info("? Test completed successfully!")
        logger.info('=' * 60)

    except Exception as e:
        # exc_info=True keeps the traceback in the log for debugging.
        logger.error(f"? Test failed: {e}", exc_info=True)
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def test_api_connection():
    """Test Gemini API connection.

    Sends a trivial text prompt to verify the API key and model name are
    valid before the heavier chart-extraction test runs.
    """
    logger.info("\n" + "=" * 60)
    logger.info("Testing Gemini API Connection")
    logger.info("=" * 60)

    try:
        import google.generativeai as genai
        # NOTE(review): Image and io are never used below; they appear to act
        # as dependency-availability probes for the ImportError branch
        # (the hint mentions Pillow) — confirm before removing.
        from PIL import Image
        import io

        genai.configure(api_key=parameters.GOOGLE_API_KEY)
        model = genai.GenerativeModel(parameters.CHART_VISION_MODEL)

        logger.info("? Gemini client initialized")

        # Test with a simple text prompt — cheap round-trip sanity check.
        response = model.generate_content("Hello! Can you respond with 'API Working'?")
        logger.info(f"? API Response: {response.text}")

        logger.info("? Gemini API connection successful!")

    except ImportError as e:
        logger.error(f"? Missing dependency: {e}")
        logger.info("Install with: pip install google-generativeai Pillow")
    except Exception as e:
        logger.error(f"? API test failed: {e}")
        logger.info("Check your GOOGLE_API_KEY in .env file")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# Script entry point: run the API connectivity check first, then the
# end-to-end chart-extraction diagnostic.
if __name__ == "__main__":
    print("\n?? SmartDoc AI - Chart Extraction Test Suite\n")

    # Test 1: API Connection
    test_api_connection()

    # Test 2: Chart Extraction
    test_chart_extraction()

    print("\n? All tests completed!\n")
|
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8fe3c8d74ae8a7762e6f389543f0f2c53e6127832955b377ed768f8759db70d
|
| 3 |
+
size 16165996
|
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:059abd7ab166731c13bd8dc4dc0724104918b450e9625ca4bc9f27ed0016170e
|
| 3 |
+
size 100
|
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc43535869cc54fbd80a6a47dac2fd0b07f4eeb0c028b5c96026b6cdc271832b
|
| 3 |
+
size 463184
|
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa6bfa281c8fe4e4977d5382b077dee4a3c4e5c750985cdf3d3660a6f92dab67
|
| 3 |
+
size 20132
|
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ffcd2c7be0de4c70919af69080b33cbd5c7487471058b2a70ee5bf95ab86ea00
|
| 3 |
+
size 42436
|