Spaces:

TilanB
/

SmartDocAI

Running

App Files Files Community

TilanB committed on Dec 29, 2025

Commit

50fcf88

1 Parent(s): 8c07ea1

Initial commit for Hugging Face Space

Browse files

Files changed (39) hide show

.dockerignore +57 -0
.env.template +48 -0
.gitignore +105 -0
README.md +114 -11
activate_venv.bat +18 -0
activate_venv.ps1 +18 -0
configuration/__init__.py +4 -0
configuration/definitions.py +8 -0
configuration/logger_setup.py +62 -0
configuration/parameters.py +133 -0
content_analyzer/__init__.py +3 -0
content_analyzer/document_parser.py +842 -0
content_analyzer/visual_detector.py +354 -0
core/__init__.py +3 -0
core/diagnostics.py +125 -0
core/lifecycle.py +160 -0
core/logger.py +16 -0
dependencies.txt +52 -0
intelligence/__init__.py +5 -0
intelligence/accuracy_verifier.py +362 -0
intelligence/context_validator.py +235 -0
intelligence/knowledge_synthesizer.py +172 -0
intelligence/orchestrator.py +388 -0
main.py +986 -0
maintenance.py +41 -0
requirements.txt +53 -0
search_engine/__init__.py +3 -0
search_engine/indexer.py +228 -0
test_token_size.py +78 -0
tests/conftest.py +71 -0
tests/test_accuracy_verifier.py +110 -0
tests/test_context_validator.py +120 -0
tests/test_knowledge_synthesizer.py +50 -0
tests/test_visual_extraction.py +169 -0
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/data_level0.bin +3 -0
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/header.bin +3 -0
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/index_metadata.pickle +3 -0
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/length.bin +3 -0
vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/link_lists.bin +3 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,57 @@

+# Ignore Python bytecode
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+# IDE settings
+.idea/
+.vscode/
+*.swp
+*.swo
+# Environment files (keep .env.template)
+.env
+.env.local
+.env.*.local
+# Logs (mount as volume in production)
+logs/
+*.log
+# ChromaDB data (mount as volume in production)
+chroma_db/
+# Document cache (mount as volume in production)
+document_cache/
+# Test artifacts
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+# Build artifacts
+dist/
+build/
+*.egg-info/
+# Jupyter
+.ipynb_checkpoints/
+# Git
+.git/
+.gitignore
+# Documentation build
+docs/_build/
+# Misc
+*.DS_Store
+Thumbs.db

.env.template ADDED Viewed

	@@ -0,0 +1,48 @@

+# SmartDoc AI Environment Configuration
+# Copy this file to .env and fill in your values
+# =============================================================================
+# REQUIRED SETTINGS
+# =============================================================================
+# Google API Key for Gemini models (required)
+# Get your key at: https://makersuite.google.com/app/apikey
+GOOGLE_API_KEY=your_google_api_key_here
+# =============================================================================
+# OPTIONAL SETTINGS (with defaults)
+# =============================================================================
+# Database settings
+# CHROMA_DB_PATH=./chroma_db
+# Chunking settings
+# CHUNK_SIZE=2000
+# CHUNK_OVERLAP=100
+# Retriever settings
+# VECTOR_SEARCH_K=25         # Number of documents to retrieve via vector search
+# VECTOR_FETCH_K=35          # Candidate pool size for MMR
+# VECTOR_SCORE_THRESHOLD=0.3 # Minimum relevance score (0-1)
+# BM25_SEARCH_K=8            # Number of documents to retrieve via BM25
+# HYBRID_RETRIEVER_WEIGHTS=[0.4, 0.6]  # [BM25 weight, Vector weight]
+# Logging settings
+# LOG_LEVEL=INFO
+# Cache settings
+# CACHE_DIR=document_cache
+# CACHE_EXPIRE_DAYS=7
+# LLM settings
+# LLM_MAX_RETRIES=3
+# LLM_RETRY_DELAY=1.0
+# LLM_MODEL_NAME=gemini-2.5-flash-lite  # Default model for all agents
+# Agent-specific LLM models (override LLM_MODEL_NAME if needed)
+# RESEARCH_AGENT_MODEL=gemini-2.5-flash-lite
+# VERIFICATION_AGENT_MODEL=gemini-2.5-flash-lite
+# RELEVANCE_CHECKER_MODEL=gemini-2.5-flash-lite
+# Server settings (optional, for Gradio)
+# GRADIO_SERVER_PORT=7860

.gitignore ADDED Viewed

	@@ -0,0 +1,105 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+*.manifest
+*.spec
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Environments - IMPORTANT: Keep .env out of repo
+.env
+.env.local
+.env.*.local
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.vs/
+# Visual Studio
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# Logs
+logs/
+*.log
+# Database
+*.db
+*.sqlite3
+# ChromaDB / Vector stores - Recreated at runtime
+chroma_db/
+*.chroma
+# Cache - Recreated at runtime
+.cache/
+cache/
+document_cache/
+*.pkl
+# OS files
+.DS_Store
+Thumbs.db
+# Temporary files
+tmp/
+temp/
+*.tmp
+# Hugging Face Spaces
+.gradio/
+flagged/

README.md CHANGED Viewed

@@ -1,14 +1,117 @@
 ---
-title: SmartDocAI
-emoji: 👁
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 6.2.0
-app_file: app.py
-pinned: false
-license: mit
-short_description: SmartDoc AI is an intelligent document analysis and question
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# SmartDoc AI
+SmartDoc AI is an advanced document analysis and question answering system. It allows you to upload documents, ask questions, and receive accurate, source-verified answers. The system uses a multi-agent workflow, hybrid search, and both local and cloud-based chart detection for high performance and cost efficiency.
+---
+## Features
+- **Multi-format Document Support**: PDF, DOCX, TXT, and Markdown
+- **Smart Chunking**: Configurable chunk size and overlap for optimal retrieval
+- **Intelligent Caching**: Speeds up repeated queries
+- **Chart Extraction**: Detects and analyzes charts using OpenCV and Gemini Vision
+- **Hybrid Search**: Combines keyword and vector search for best results
+- **Multi-Agent Workflow**: Relevance checking, research, and answer verification
+- **Production Ready**: Structured logging, environment-based config, and test suite
+- **Efficient**: Local chart detection saves up to 95% on API costs
 ---
+## Quick Start
+### Prerequisites
+- Python 3.11 or higher
+- Google API Key for Gemini models ([Get one here](https://ai.google.dev/))
+### Installation
+1. Clone the repository:
+```bash
+git clone https://github.com/TilanTAB/Intelligent-Document-Analysis-Q-A-3.git
+cd Intelligent-Document-Analysis-Q-A-3
+```
+2. Activate the virtual environment:
+```bash
+# Windows PowerShell
+.\activate_venv.ps1
+# Windows Command Prompt
+activate_venv.bat
+# Or manually:
+.\venv\Scripts\Activate.ps1
+```
+3. Install dependencies (if needed):
+```bash
+pip install -r dependencies.txt
+```
+4. Configure environment variables:
+```bash
+cp .env.template .env
+# Edit .env and set your API key
+GOOGLE_API_KEY=your_api_key_here
+```
+5. (Optional) Verify installation:
+```bash
+python verify_environment.py
+```
+6. Run the application:
+```bash
+python main.py
+```
+7. Open your browser to [http://localhost:7860](http://localhost:7860)
+---
+## Configuration
+All settings can be configured via environment variables or the `.env` file. Key options include:
+- `GOOGLE_API_KEY`: Your Gemini API key (required)
+- `CHUNK_SIZE`, `CHUNK_OVERLAP`: Document chunking
+- `ENABLE_CHART_EXTRACTION`: Enable/disable chart detection
+- `CHART_USE_LOCAL_DETECTION`: Use OpenCV for free chart detection
+- `CHART_ENABLE_BATCH_ANALYSIS`: Batch process charts for speed
+- `CHART_GEMINI_BATCH_SIZE`: Number of charts per Gemini API call
+- `LOG_LEVEL`: Logging verbosity
+- `GRADIO_SERVER_PORT`: Web interface port
+---
+## Project Structure
+- `intelligence/` - Multi-agent system (relevance, research, verification)
+- `configuration/` - App settings and logging
+- `content_analyzer/` - Document and chart processing
+- `search_engine/` - Hybrid retriever logic
+- `core/` - Utilities and diagnostics
+- `tests/` - Test suite
+- `main.py` - Application entry point
+---
+## Troubleshooting
+- **API Key Not Found**: Set `GOOGLE_API_KEY` in your `.env` file.
+- **Python 3.13 Issues**: Use Python 3.11 or 3.12 for best compatibility.
+- **Chart Detection Slow**: Lower `CHART_DPI` or `CHART_MAX_IMAGE_SIZE` in `.env`.
+- **ChromaDB Lock Issues**: Stop all instances and remove lock files in `vector_store/`.
+---
+## Contributing
+Contributions are welcome! Please fork the repository, create a feature branch, and submit a pull request with a clear description.
+---
+## License
+This project is licensed under the MIT License.
 ---
+SmartDoc AI is actively maintained and designed for real-world document analysis and Q&A. For updates and support, visit the [GitHub repository](https://github.com/TilanTAB/Intelligent-Document-Analysis-Q-A-3).

activate_venv.bat ADDED Viewed

	@@ -0,0 +1,18 @@

+@echo off
+REM SmartDoc AI - Virtual Environment Activation Script (Windows Command Prompt)
+REM Run this script to activate the virtual environment
+echo ?? Activating SmartDoc AI virtual environment...
+echo.
+call venv\Scripts\activate.bat
+echo.
+echo ? Virtual environment activated!
+echo.
+echo ?? To run the application:
+echo    python main.py
+echo.
+echo ?? To deactivate:
+echo    deactivate
+echo.

activate_venv.ps1 ADDED Viewed

	@@ -0,0 +1,18 @@

+# SmartDoc AI - Virtual Environment Activation Script
+# Run this script to activate the virtual environment
+Write-Host "?? Activating SmartDoc AI virtual environment..." -ForegroundColor Cyan
+# Activate the virtual environment
+& ".\venv\Scripts\Activate.ps1"
+Write-Host "? Virtual environment activated!" -ForegroundColor Green
+Write-Host ""
+Write-Host "?? Installed packages:" -ForegroundColor Yellow
+pip list | Select-String -Pattern "langchain|chromadb|gradio|opencv|google-generativeai"
+Write-Host ""
+Write-Host "?? To run the application:" -ForegroundColor Cyan
+Write-Host "   python main.py" -ForegroundColor White
+Write-Host ""
+Write-Host "?? To deactivate:" -ForegroundColor Cyan
+Write-Host "   deactivate" -ForegroundColor White

configuration/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .parameters import parameters
+from .definitions import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+__all__ = ["parameters", "MAX_FILE_SIZE", "MAX_TOTAL_SIZE", "ALLOWED_TYPES"]

configuration/definitions.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# All size limits below are expressed in bytes (1 MB = 1024 ** 2 bytes here).
+# Per-file upload cap: 50 MB
+MAX_FILE_SIZE: int = 50 * (1024 ** 2)
+# Combined cap across every file in one upload: 200 MB
+MAX_TOTAL_SIZE: int = 200 * (1024 ** 2)
+# File extensions (lowercase, with the leading dot) accepted for upload
+ALLOWED_TYPES: list = [".txt", ".pdf", ".docx", ".md"]

configuration/logger_setup.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import logging
+from logging.handlers import RotatingFileHandler, QueueHandler, QueueListener
+import queue
+import os
+import sys
+from pathlib import Path
+# Custom formatter that replaces characters the cp1252 code page cannot encode
+class SafeFormatter(logging.Formatter):
+    """Formatter whose output can always be encoded as cp1252.
+    Every character that cp1252 cannot represent (emoji, most non-Latin
+    scripts, and the few code points cp1252 leaves undefined) is replaced
+    with '?'. Characters cp1252 does support, such as the euro sign or
+    curly quotes, are kept unchanged.
+    """
+    def format(self, record):
+        """Return the formatted record with non-cp1252 characters replaced by '?'."""
+        msg = super().format(record)
+        # Round-trip through the real codec instead of testing ord(c) < 256:
+        # that test describes Latin-1, which differs from cp1252 in the
+        # 0x80-0x9F range (it drops encodable characters like the euro sign
+        # and lets through code points such as U+0081 that cp1252 rejects).
+        return msg.encode("cp1252", errors="replace").decode("cp1252")
+# Ensure the logs directory exists
+log_dir = Path("logs")
+log_dir.mkdir(exist_ok=True)
+# Configure log file path (derived from log_dir so the two cannot drift apart)
+log_file_path = os.path.join(log_dir, "app.log")
+# Set up a queue for log messages
+log_queue = queue.Queue(-1)  # No limit on size (-1)
+# Detailed log format with timestamp, level, logger name, and message
+detailed_format = "%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+# Create a rotating file handler for the application logs
+file_handler = RotatingFileHandler(
+    log_file_path,
+    maxBytes=10 * 1024 * 1024,  # 10 MB
+    backupCount=5,              # Keep 5 backups
+    delay=True                   # Delay file opening until a log message is emitted
+)
+file_handler.setFormatter(SafeFormatter(detailed_format))
+# Create a queue handler to send log messages to the queue
+queue_handler = QueueHandler(log_queue)
+# Console handler (direct, not via queue)
+console_handler = logging.StreamHandler(sys.stdout)
+console_handler.setFormatter(SafeFormatter(detailed_format))
+console_handler.setLevel(logging.INFO)
+# Get the root logger
+# NOTE(review): the level is hard-coded to INFO in this module; the LOG_LEVEL
+# setting is not consulted here - confirm whether that is intended.
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+root_logger.handlers = [console_handler, queue_handler]  # Console direct, queue for file
+# Create and start a listener for the queue to process log messages in the background
+listener = QueueListener(log_queue, file_handler)
+listener.start()
+# Stop the listener at interpreter exit so records still waiting in the queue
+# are written to the log file instead of being dropped.
+import atexit
+atexit.register(listener.stop)
+# Suppress verbose logs from specific third-party libraries
+logging.getLogger("langchain").setLevel(logging.WARNING)
+logging.getLogger("langchain_community").setLevel(logging.WARNING)
+logging.getLogger("chromadb").setLevel(logging.WARNING)
+logging.getLogger("google").setLevel(logging.WARNING)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+root_logger.info("Logging system initialized successfully.")

configuration/parameters.py ADDED Viewed

	@@ -0,0 +1,133 @@

+from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import Field, field_validator
+from typing import Optional
+import os
+from .definitions import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
+class Settings(BaseSettings):
+    """
+    Application parameters loaded from environment variables.
+    Values are read from the process environment and from a `.env` file;
+    variable names are matched case-insensitively and unknown variables are
+    ignored (see `model_config`). Every field except GOOGLE_API_KEY has a
+    default.
+    For local development:
+        Create a .env file in the project root with your configuration:
+        GOOGLE_API_KEY=your_api_key_here
+    For Hugging Face Spaces:
+        Add GOOGLE_API_KEY as a secret in Space Settings > Repository secrets
+    """
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
+    # File upload parameters with defaults from definitions
+    MAX_FILE_SIZE: int = MAX_FILE_SIZE
+    MAX_TOTAL_SIZE: int = MAX_TOTAL_SIZE
+    ALLOWED_TYPES: list = ALLOWED_TYPES
+    # API keys - REQUIRED, must be set via environment variable or HF Secrets
+    GOOGLE_API_KEY: str = Field(
+        ...,  # Required field
+        description="Google API key for Gemini models",
+    )
+    # Database parameters
+    CHROMA_DB_PATH: str = "./chroma_db"
+    # Chunking parameters
+    CHUNK_SIZE: int = 2000
+    CHUNK_OVERLAP: int = 100
+    # Retriever parameters
+    VECTOR_SEARCH_K: int = 25
+    VECTOR_Search_K_CHROMA: int = 15
+    VECTOR_FETCH_K: int = 35
+    VECTOR_SCORE_THRESHOLD: float = 0.3
+    BM25_SEARCH_K: int = 8
+    HYBRID_RETRIEVER_WEIGHTS: list = [0.4, 0.6]  # [BM25 weight, Vector weight]
+    CHROMA_COLLECTION_NAME: str = "documents"
+    # Workflow parameters
+    MAX_RESEARCH_ATTEMPTS: int = 2
+    ENABLE_QUERY_REWRITING: bool = True
+    MAX_QUERY_REWRITES: int = 1
+    RELEVANCE_CHECK_K: int = 20
+    # Research agent parameters
+    RESEARCH_TOP_K: int = 15
+    # NOTE(review): a limit this large is effectively "no limit" - confirm intended
+    RESEARCH_MAX_CONTEXT_CHARS: int = 8000000000
+    RESEARCH_MAX_OUTPUT_TOKENS: int = 500
+    # Verification parameters
+    # NOTE(review): a limit this large is effectively "no limit" - confirm intended
+    VERIFICATION_MAX_CONTEXT_CHARS: int = 800000000
+    VERIFICATION_MAX_OUTPUT_TOKENS: int = 300
+    # Logging parameters
+    LOG_LEVEL: str = "INFO"
+    # Cache parameters
+    CACHE_DIR: str = "document_cache"
+    CACHE_EXPIRE_DAYS: int = 7
+    # LLM parameters
+    LLM_MAX_RETRIES: int = 3
+    LLM_RETRY_DELAY: float = 1.0
+    LLM_MODEL_NAME: str = "gemini-2.5-flash-lite"  # Default model for all agents
+    # Agent-specific LLM models (override LLM_MODEL_NAME if needed)
+    RESEARCH_AGENT_MODEL: str = "gemini-2.5-flash-lite"
+    VERIFICATION_AGENT_MODEL: str = "gemini-2.5-flash-lite"
+    RELEVANCE_CHECKER_MODEL: str = "gemini-2.5-flash-lite"
+    # Chart extraction parameters
+    ENABLE_CHART_EXTRACTION: bool = True
+    CHART_VISION_MODEL: str = "gemini-2.5-flash-lite"
+    CHART_MAX_TOKENS: int = 1500
+    CHART_DPI: int = 150  # Lower DPI saves memory
+    CHART_BATCH_SIZE: int = 3  # Process pages in batches
+    CHART_MAX_IMAGE_SIZE: int = 1920  # Max dimension for images
+    # Local chart detection parameters (cost optimization)
+    CHART_USE_LOCAL_DETECTION: bool = True  # Use OpenCV first (FREE)
+    CHART_MIN_CONFIDENCE: float = 0.4  # Only analyze charts with confidence > 40%
+    CHART_SKIP_GEMINI_DETECTION: bool = True  # Skip Gemini for detection, only use for analysis
+    CHART_GEMINI_FALLBACK_ENABLED: bool = False  # Optional: Use Gemini if local fails
+    # Gemini batch processing parameters (speed optimization - 2-3× faster)
+    CHART_GEMINI_BATCH_SIZE: int = 1  # Analyze 1 chart per API call (reduced from 2 for reliability)
+    CHART_ENABLE_BATCH_ANALYSIS: bool = True  # Enable batch processing for speed
+    @field_validator("GOOGLE_API_KEY")
+    @classmethod
+    def validate_api_key(cls, v: str) -> str:
+        """Validate that API key is provided and not a placeholder.
+        Returns:
+            The key with surrounding whitespace removed.
+        Raises:
+            ValueError: If the key is empty/blank or still a template placeholder.
+        """
+        # Strip first: a key pasted with a trailing newline or space would
+        # otherwise pass validation and then be sent to the API as-is.
+        key = v.strip() if v else ""
+        if not key:
+            raise ValueError("GOOGLE_API_KEY is required. Set it in your .env file or HF Secrets.")
+        # Case-insensitive so "your_...", "Your_..." and "YOUR_API_KEY_HERE" are all caught.
+        if key.lower().startswith("your_"):
+            raise ValueError("Please replace the placeholder GOOGLE_API_KEY with your actual API key.")
+        return key
+def _get_parameters():
+    """Build the Settings object, printing a setup hint to stderr on failure.
+    Returns:
+        Settings: the loaded configuration.
+    Raises:
+        Exception: whatever Settings() raised, re-raised unchanged after the
+            error and an environment-specific tip have been printed.
+    """
+    try:
+        loaded = Settings()
+    except Exception as config_error:
+        import sys
+        # The presence of SPACE_ID is treated as "running on a Hugging Face Space".
+        if "SPACE_ID" in os.environ:
+            tip = "💡 Tip: Add GOOGLE_API_KEY in Space Settings > Repository secrets"
+        else:
+            tip = "💡 Tip: Create a .env file with GOOGLE_API_KEY=your_api_key"
+        print(f"⚠️  Configuration Error: {config_error}", file=sys.stderr)
+        print(tip, file=sys.stderr)
+        raise
+    return loaded
+# Create parameters instance
+parameters = _get_parameters()

content_analyzer/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .document_parser import DocumentProcessor
2	+
3	+ __all__ = ["DocumentProcessor"]

content_analyzer/document_parser.py ADDED Viewed

	@@ -0,0 +1,842 @@

+import pickle
+import hashlib
+import logging
+from pathlib import Path
+from typing import List, Optional
+from datetime import datetime, timedelta
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from configuration.parameters import parameters
+from configuration.definitions import MAX_TOTAL_SIZE, ALLOWED_TYPES
+import concurrent.futures
+from PIL import Image
+import gc
+from google.genai import types
+logger = logging.getLogger(__name__)
+def preprocess_image(image, max_dim=1000):
+    """Downscale image so its longest side is at most max_dim before OpenCV processing.
+    Args:
+        image: PIL image; only `.size` and `.resize` are used.
+        max_dim: Upper bound, in pixels, for the longest side.
+    Returns:
+        The same image object when it already fits, otherwise a resized copy
+        with the aspect ratio preserved (up to integer rounding).
+    """
+    longest_side = max(image.size)
+    if longest_side <= max_dim:
+        return image
+    ratio = max_dim / longest_side
+    # Clamp each side to at least 1 px: for very elongated images (e.g. 5000x1)
+    # int(dim * ratio) truncates the short side to 0, which resize() rejects.
+    new_size = tuple(max(1, int(dim * ratio)) for dim in image.size)
+    return image.resize(new_size, Image.Resampling.LANCZOS)
+def detect_chart_on_page(args):
+    """
+    Run local chart detection on a single page image.
+    Kept as a module-level function so it can be handed to an executor
+    (the original note says this is required for ProcessPoolExecutor).
+    Args:
+        args: A `(page_num, image)` pair.
+    Returns:
+        tuple: `(page_num, downscaled_image, detection_result)`.
+    """
+    page_num, page_image = args
+    from content_analyzer.visual_detector import LocalChartDetector
+    # Shrink first so detection works on at most 1000 px per side (saves memory).
+    scaled = preprocess_image(page_image, max_dim=1000)
+    # The image is returned, not discarded: the caller still needs it.
+    return (page_num, scaled, LocalChartDetector.detect_charts(scaled))
+def analyze_batch(batch_tuple):
+    """
+    Analyze one batch of detected charts with a single Gemini request.
+    Top-level function for parallel Gemini batch analysis (future-proof for process pools).
+    Args:
+        batch_tuple: 7-tuple of
+            (batch, batch_num, total_batches, gemini_client, file_path, parameters, stats),
+            where `batch` is a sequence of (page_num, image_path, detection_result)
+            and `stats` is a mutable mapping whose 'batch_api_calls' and
+            'charts_analyzed_gemini' counters are incremented in place.
+    Returns:
+        tuple: (batch_num - 1, list of chart Documents). The list is empty
+        when anything in the batch fails; the error is logged, not raised.
+    """
+    batch, batch_num, total_batches, gemini_client, file_path, parameters, stats = batch_tuple
+    import logging
+    logger = logging.getLogger(__name__)
+    # Track opened images outside the try so `finally` can always close them,
+    # including when the API call or response parsing raises.
+    images = []
+    try:
+        from PIL import Image
+        from google.genai import types
+        for _, image_path, _ in batch:
+            images.append(Image.open(image_path))
+        prompt = f"""
+Analyze the following {len(batch)} chart(s)/graph(s) in order.
+For EACH chart, provide comprehensive analysis separated by the marker "---CHART N---".
+For each chart include:
+**Chart Type**: [line/bar/pie/bubble/scatter/etc]
+**Title**: [chart title]
+**X-axis**: [label and units]
+**Y-axis**: [label and units]
+**Data Points**: [extract ALL visible data with exact values]
+**Legend**: [list all series/categories]
+**Trends**: [key patterns, trends, insights]
+**Key Values**: [maximum, minimum, significant values]
+**Context**: [any annotations or notes]
+Format exactly as:
+---CHART 1---
+[analysis]
+---CHART 2---
+[analysis]
+---CHART 3---
+[analysis]
+"""
+        # One request for the whole batch; the token budget scales with batch size.
+        chart_response = gemini_client.models.generate_content(
+            model=parameters.CHART_VISION_MODEL,
+            contents=[prompt] + images,
+            config=types.GenerateContentConfig(
+                max_output_tokens=parameters.CHART_MAX_TOKENS * len(batch)
+            )
+        )
+        stats['batch_api_calls'] += 1
+        response_text = chart_response.text
+        # parts[0] is whatever precedes the first marker; parts[k] starts with "k---".
+        parts = response_text.split('---CHART ')
+        batch_docs = []
+        for idx, (page_num, image_path, detection_result) in enumerate(batch):
+            if idx + 1 < len(parts):
+                analysis_text = parts[idx + 1]
+                if '---CHART' in analysis_text:
+                    analysis_text = analysis_text.split('---CHART')[0]
+                lines = analysis_text.split('\n')
+                # Drop the remainder of the marker line (e.g. "1---").
+                if lines and '---' in lines[0]:
+                    lines = lines[1:]
+                analysis = '\n'.join(lines).strip()
+            else:
+                # The model returned fewer sections than charts in the batch.
+                analysis = "Analysis unavailable (parsing error)"
+            chart_types_str = ", ".join(detection_result['chart_types']) or "Unknown"
+            confidence = detection_result['confidence']
+            chart_doc = Document(
+                page_content=f"""### 📊 Chart Analysis (Page {page_num})\n\n**Detection Method**: Hybrid (Local OpenCV + Gemini Batch Analysis)\n**Local Confidence**: {confidence:.0%}\n**Detected Types**: {chart_types_str}\n**Batch Size**: {len(batch)} charts analyzed together\n\n---\n\n{analysis}\n""",
+                metadata={
+                    "source": file_path,
+                    "page": page_num,
+                    "type": "chart",
+                    "extraction_method": "hybrid_batch",
+                    "detection_confidence": confidence,
+                    "batch_size": len(batch)
+                }
+            )
+            batch_docs.append(chart_doc)
+            stats['charts_analyzed_gemini'] += 1
+        logger.info(f"✅ Batch {batch_num} complete ({len(batch)} charts analyzed)")
+        return (batch_num - 1, batch_docs)
+    except Exception as e:
+        # Best-effort: report the failure and hand back an empty result for this batch.
+        logger.error(f"Batch analysis failed: {e}, trying sequential fallback...")
+        return (batch_num - 1, [])
+    finally:
+        for img in images:
+            img.close()
+class DocumentProcessor:
+    """
+    Processes documents by splitting them into manageable chunks and caching
+    the results to avoid reprocessing. Handles chart extraction using local
+    OpenCV detection and Gemini Vision API with parallelization for speed.
+    """
+    # Cache metadata version - increment when cache format changes
+    CACHE_VERSION = 4  # Incremented for chart extraction support
+    def __init__(self):
+        """Set up the cache directory, the text splitter and, if enabled, Gemini Vision."""
+        cache_root = Path(parameters.CACHE_DIR)
+        cache_root.mkdir(parents=True, exist_ok=True)
+        self.cache_dir = cache_root
+        # Character-based splitter: length is measured with len(), separators are literal.
+        splitter_options = {
+            "chunk_size": parameters.CHUNK_SIZE,
+            "chunk_overlap": parameters.CHUNK_OVERLAP,
+            "length_function": len,
+            "is_separator_regex": False,
+        }
+        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)
+        # Both stay None unless chart extraction is enabled and initialization succeeds.
+        self.gemini_client = None
+        self.genai_module = None  # Store the module reference
+        if parameters.ENABLE_CHART_EXTRACTION:
+            self._init_gemini_vision()
+        # Read the flag only now: _init_gemini_vision may have switched it off.
+        chart_state = 'enabled' if parameters.ENABLE_CHART_EXTRACTION else 'disabled'
+        logger.debug(f"DocumentProcessor initialized with cache dir: {self.cache_dir}")
+        logger.debug(f"Chunk size: {parameters.CHUNK_SIZE}, Chunk overlap: {parameters.CHUNK_OVERLAP}")
+        logger.debug(f"Chart extraction: {chart_state}")
+    def _init_gemini_vision(self):
+        """Create the Gemini client used for chart analysis.
+        On any failure (package missing or client construction error) chart
+        extraction is switched off by setting
+        `parameters.ENABLE_CHART_EXTRACTION = False`; nothing is raised.
+        """
+        try:
+            # Use the new google.genai package
+            import google.genai as genai_pkg
+        except ImportError as import_error:
+            logger.warning(f"google-genai not installed: {import_error}")
+            logger.info("Install with: pip install google-genai")
+            parameters.ENABLE_CHART_EXTRACTION = False
+            return
+        logger.debug("✅ Loaded google.genai (new package)")
+        self.genai_module = genai_pkg
+        try:
+            from google import genai as genai_api
+            client = genai_api.Client(api_key=parameters.GOOGLE_API_KEY)
+        except Exception as client_error:
+            logger.error(f"❌ Failed to initialize Gemini Vision client: {client_error}")
+            parameters.ENABLE_CHART_EXTRACTION = False
+        else:
+            self.gemini_client = client
+            logger.info("✅ Gemini Vision client initialized")
+    def validate_files(self, files: List) -> bool:
+        """
+        Validate that uploaded files meet size and type requirements.
+        Args:
+            files: List of uploaded file objects. Each must expose `.name`
+                (a filesystem path); `.size` is used when present and not None.
+        Returns:
+            bool: True if all validations pass
+        Raises:
+            ValueError: If no files are given, a file's size cannot be
+                determined, a file is too large or of an unsupported type,
+                or the combined size exceeds the total limit.
+        """
+        if not files:
+            raise ValueError("No files provided")
+        total_size = 0
+        for file in files:
+            # Prefer the size reported by the upload object; treat a missing
+            # attribute and an explicit None the same way.
+            file_size = getattr(file, 'size', None)
+            if file_size is None:
+                # Fallback: ask the filesystem rather than reading the whole
+                # file into memory just to measure it.
+                try:
+                    file_size = Path(file.name).stat().st_size
+                except (OSError, TypeError, ValueError) as e:
+                    logger.error(f"Failed to determine file size for {file.name}: {e}")
+                    raise ValueError(f"Cannot read file: {file.name}") from e
+            # Check individual file size
+            if file_size > parameters.MAX_FILE_SIZE:
+                raise ValueError(
+                    f"File {file.name} exceeds maximum size "
+                    f"({file_size / 1024 / 1024:.2f}MB > {parameters.MAX_FILE_SIZE / 1024 / 1024:.2f}MB)"
+                )
+            # Check file type (case-insensitive on the extension)
+            file_ext = Path(file.name).suffix.lower()
+            if file_ext not in ALLOWED_TYPES:
+                raise ValueError(
+                    f"File type {file_ext} not supported. Allowed types: {ALLOWED_TYPES}"
+                )
+            total_size += file_size
+        # Check total size
+        if total_size > parameters.MAX_TOTAL_SIZE:
+            raise ValueError(
+                f"Total file size exceeds maximum "
+                f"({total_size / 1024 / 1024:.2f}MB > {parameters.MAX_TOTAL_SIZE / 1024 / 1024:.2f}MB)"
+            )
+        logger.info(f"Validation passed for {len(files)} files (total: {total_size / 1024 / 1024:.2f}MB)")
+        return True
+    def _generate_hash(self, content: bytes) -> str:
+        """Return the SHA-256 digest of `content` as a lowercase hex string."""
+        digest = hashlib.sha256()
+        digest.update(content)
+        return digest.hexdigest()
+    def _is_cache_valid(self, cache_path: Path) -> bool:
+        """Report whether `cache_path` exists and is not older than CACHE_EXPIRE_DAYS.
+        An expired cache file is deleted as a side effect.
+        """
+        if not cache_path.exists():
+            logger.debug(f"Cache miss: {cache_path.name}")
+            return False
+        # Age is measured from the file's last-modified time.
+        checked_at = datetime.now()
+        file_age = checked_at - datetime.fromtimestamp(cache_path.stat().st_mtime)
+        max_age = timedelta(days=parameters.CACHE_EXPIRE_DAYS)
+        if file_age <= max_age:
+            logger.debug(f"Cache hit: {cache_path.name} (age: {file_age.days} days)")
+            return True
+        logger.info(f"Cache expired (age: {file_age.days} days): {cache_path.name}")
+        cache_path.unlink()
+        return False
+    def _load_from_cache(self, cache_path: Path) -> List:
+        """Return the chunks stored in a pickle cache file, or [] if it is unusable.
+        A file that cannot be unpickled, or lacks the 'chunks'/'timestamp'
+        keys, is deleted so it will be regenerated; errors are logged, not raised.
+        """
+        try:
+            with open(cache_path, "rb") as cache_file:
+                data = pickle.load(cache_file)
+            has_required_keys = "chunks" in data and "timestamp" in data
+            if not has_required_keys:
+                raise KeyError("Cache file missing 'chunks' or 'timestamp' key.")
+            logger.info(f"Loaded {len(data['chunks'])} chunks from cache: {cache_path.name}")
+            return data["chunks"]
+        except (pickle.UnpicklingError, KeyError, EOFError) as corruption:
+            # Recognized corruption: remove the file and report a cache miss.
+            logger.warning(f"Cache corruption detected in {cache_path.name}: {corruption}. Deleting cache.")
+            cache_path.unlink()
+        except Exception as unexpected:
+            logger.error(f"Unexpected error loading cache {cache_path.name}: {unexpected}", exc_info=True)
+            # The file may never have existed (e.g. open() failed), so check first.
+            if cache_path.exists():
+                cache_path.unlink()
+        return []
+    def _save_to_cache(self, chunks: List, cache_path: Path):
+        """Pickle `chunks` together with the current timestamp into `cache_path`.
+        Failures are logged and swallowed: a failed cache write never
+        interrupts document processing.
+        """
+        try:
+            with open(cache_path, "wb") as cache_file:
+                payload = {
+                    "timestamp": datetime.now().timestamp(),
+                    "chunks": chunks,
+                }
+                pickle.dump(payload, cache_file)
+        except Exception as write_error:
+            logger.error(f"Failed to save cache to {cache_path.name}: {write_error}", exc_info=True)
+        else:
+            logger.info(f"Successfully cached {len(chunks)} chunks to {cache_path.name}")
+    def _process_file(self, file, progress_callback=None) -> List[Document]:
+        file_ext = Path(file.name).suffix.lower()
+        if file_ext not in ALLOWED_TYPES:
+            logger.warning(f"Skipping unsupported file type: {file.name}")
+            return []
+        try:
+            documents = []
+            if file_ext == '.pdf':
+                import concurrent.futures
+                results = {}
+                def run_pdfplumber():
+                    return self._load_pdf_with_pdfplumber(file.name)
+                def run_charts():
+                    logger.info(f"ENABLE_CHART_EXTRACTION={parameters.ENABLE_CHART_EXTRACTION}, gemini_client={self.gemini_client is not None}")
+                    if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
+                        return self._extract_charts_from_pdf(file.name)
+                    return []
+                try:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+                        future_pdf = executor.submit(run_pdfplumber)
+                        future_charts = executor.submit(run_charts)
+                        try:
+                            docs = future_pdf.result()
+                        except MemoryError as e:
+                            logger.error(f"Out of memory in PDFPlumber thread: {e}. Falling back to sequential.")
+                            docs = self._load_pdf_with_pdfplumber(file.name)
+                        try:
+                            chart_docs = future_charts.result()
+                        except MemoryError as e:
+                            logger.error(f"Out of memory in chart extraction thread: {e}. Falling back to sequential.")
+                            chart_docs = self._extract_charts_from_pdf(file.name)
+                        documents = docs or []
+                        if chart_docs:
+                            documents.extend(chart_docs)
+                            logger.info(f"📊 Added {len(chart_docs)} chart descriptions to {file.name}")
+                except MemoryError as e:
+                    logger.error(f"Out of memory in parallel PDF processing: {e}. Falling back to sequential.")
+                    documents = self._load_pdf_with_pdfplumber(file.name)
+                    if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
+                        chart_docs = self._extract_charts_from_pdf(file.name)
+                        if chart_docs:
+                            documents.extend(chart_docs)
+                            logger.info(f"📊 Added {len(chart_docs)} chart descriptions to {file.name}")
+            else:
+                from langchain_community.document_loaders import (
+                    Docx2txtLoader,
+                    TextLoader,
+                )
+                loader_map = {
+                    '.docx': Docx2txtLoader,
+                    '.txt': TextLoader,
+                    '.md': TextLoader,
+                }
+                loader_class = loader_map.get(file_ext)
+                if not loader_class:
+                    logger.warning(f"No loader found for {file_ext}")
+                    return []
+                logger.info(f"Loading {file_ext} file: {file.name}")
+                loader = loader_class(file.name)
+                documents = loader.load()
+            if not documents:
+                logger.warning(f"No content extracted from {file.name}")
+                return []
+            all_chunks = []
+            total_docs = len(documents)
+            file_hash = self._generate_hash(file.name.encode())  # Unique per file
+            for i, doc in enumerate(documents):
+                page_chunks = self.splitter.split_text(doc.page_content)
+                total_chunks = len(page_chunks)
+                for j, chunk in enumerate(page_chunks):
+                    chunk_id = f"{file_hash}_{doc.metadata.get('page', i + 1)}_{j}"
+                    chunk_doc = Document(
+                        page_content=chunk,
+                        metadata={
+                            "source": doc.metadata.get("source", file.name),
+                            "page": doc.metadata.get("page", i + 1),
+                            "type": doc.metadata.get("type", "text"),
+                            "chunk_id": chunk_id
+                        }
+                    )
+                    all_chunks.append(chunk_doc)
+                    if progress_callback:
+                        percent = int(100 * ((i + (j + 1) / total_chunks) / total_docs))
+                        step = f"Splitting page {i+1} into chunks"
+                        progress_callback(percent, step)
+            logger.info(f"Processed {file.name}: {len(documents)} page(s) → {len(all_chunks)} chunk(s)")
+            return all_chunks
+        except ImportError as e:
+            logger.error(f"Required loader not installed for {file_ext}: {e}")
+            return []
+        except Exception as e:
+            logger.error(f"Failed to process {file.name}: {e}", exc_info=True)
+            raise
+    def _extract_charts_from_pdf(self, file_path: str) -> List[Document]:
+        """
+        Extract and analyze charts/graphs from PDF with true batch processing and parallelism.
+        PHASE 1: Parallel local chart detection (CPU-bound, uses ProcessPoolExecutor)
+        PHASE 2: Parallel Gemini batch analysis (I/O-bound, uses ThreadPoolExecutor)
+        Args:
+            file_path: Path of the PDF to scan.
+        Returns:
+            One Document per analyzed chart (metadata "type" is "chart" on the sequential
+            path), de-duplicated by the "**Title**:" line of the analysis text. An empty
+            list is returned when no chart is detected, when no Gemini client is set, or
+            when an ImportError, MemoryError or any other exception occurs (all are logged,
+            none are re-raised).
+        NOTE(review): this method uses `concurrent.futures`, `gc`, `types`,
+        `detect_chart_on_page` and `analyze_batch` without importing them here;
+        presumably they are available at module level - confirm.
+        """
+        # Hash of the path string (not the file content); used as the chunk_id prefix.
+        file_hash = self._generate_hash(file_path.encode())
+        def deduplicate_charts_by_title(chart_chunks):
+            # Keep the first chunk seen for each "**Title**: ..." value; chunks whose text
+            # has no title line are always kept.
+            seen_titles = set()
+            unique_chunks = []
+            import re
+            for chunk in chart_chunks:
+                match = re.search(r"\*\*Title\*\*:\s*(.+)", chunk.page_content)
+                title = match.group(1).strip() if match else None
+                if title and title not in seen_titles:
+                    seen_titles.add(title)
+                    unique_chunks.append(chunk)
+                elif not title:
+                    unique_chunks.append(chunk)
+            return unique_chunks
+        try:
+            from pdf2image import convert_from_path
+            from PIL import Image
+            import pdfplumber
+            import tempfile
+            import os
+            # Import local detector if enabled
+            use_local = parameters.CHART_USE_LOCAL_DETECTION
+            if use_local:
+                try:
+                    from content_analyzer.visual_detector import LocalChartDetector
+                    logger.info(f"📊 [BATCH MODE] Local detection → Temp cache → Batch analysis")
+                except ImportError:
+                    logger.warning("Local chart detector not available, falling back to Gemini")
+                    use_local = False
+            # Track statistics
+            stats = {
+                'pages_scanned': 0,
+                'charts_detected_local': 0,
+                'charts_analyzed_gemini': 0,
+                'api_calls_saved': 0,
+                'batch_api_calls': 0
+            }
+            # Get PDF page count
+            with pdfplumber.open(file_path) as pdf:
+                total_pages = len(pdf.pages)
+            logger.info(f"Processing {total_pages} pages for chart detection...")
+            # Create temp directory for chart images
+            temp_dir = tempfile.mkdtemp(prefix='charts_')
+            detected_charts = []  # [(page_num, image_path, detection_result), ...]
+            try:
+                # === PHASE 1: PARALLEL LOCAL CHART DETECTION (CPU-BOUND) ===
+                logger.info("Phase 1: Detecting charts and caching to disk...")
+                batch_size = parameters.CHART_BATCH_SIZE
+                # NOTE(review): every rendered page image is accumulated here before any
+                # detection runs, so the whole document is held in memory at once.
+                page_image_tuples = []
+                # Render the PDF in windows of `batch_size` pages (1-based, inclusive bounds).
+                for start_page in range(1, total_pages + 1, batch_size):
+                    end_page = min(start_page + batch_size - 1, total_pages)
+                    try:
+                        images = convert_from_path(
+                            file_path,
+                            dpi=parameters.CHART_DPI,
+                            first_page=start_page,
+                            last_page=end_page,
+                            fmt='jpeg',
+                            jpegopt={'quality': 85, 'optimize': True}
+                        )
+                        for idx, image in enumerate(images):
+                            page_num = start_page + idx
+                            stats['pages_scanned'] += 1
+                            # Resize if needed
+                            max_dimension = parameters.CHART_MAX_IMAGE_SIZE
+                            if max(image.size) > max_dimension:
+                                # Scale both sides by the same ratio so the longest side equals max_dimension.
+                                ratio = max_dimension / max(image.size)
+                                new_size = tuple(int(dim * ratio) for dim in image.size)
+                                image = image.resize(new_size, Image.Resampling.LANCZOS)
+                            page_image_tuples.append((page_num, image))
+                        del images
+                    except Exception as e:
+                        # A failed window is skipped; the remaining pages are still processed.
+                        logger.warning(f"Failed to process pages {start_page}-{end_page}: {e}")
+                        continue
+                detected_charts = []
+                if use_local and parameters.CHART_SKIP_GEMINI_DETECTION and page_image_tuples:
+                    logger.info("Parallel local chart detection using ProcessPoolExecutor...")
+                    # Limit parallelism to avoid memory errors
+                    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
+                        # detect_chart_on_page is defined outside this method; the loop below
+                        # unpacks each result as (page_num, image, detection_result).
+                        results = list(executor.map(detect_chart_on_page, page_image_tuples))
+                    for page_num, image, detection_result in results:
+                        if not detection_result['has_chart']:
+                            logger.debug(f"Page {page_num}: No chart detected (skipping)")
+                            stats['api_calls_saved'] += 1
+                            continue
+                        confidence = detection_result['confidence']
+                        if confidence < parameters.CHART_MIN_CONFIDENCE:
+                            logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
+                            stats['api_calls_saved'] += 1
+                            continue
+                        logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
+                        stats['charts_detected_local'] += 1
+                        # Persist the page image so Phase 2 can reload it from disk.
+                        image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
+                        image.save(image_path, 'JPEG', quality=90)
+                        detected_charts.append((page_num, image_path, detection_result))
+                        # Release memory
+                        del image
+                        gc.collect()
+                else:
+                    # Fallback: sequential detection
+                    # NOTE(review): this branch is only reached when local detection is off
+                    # (or there are no page images). The inner condition below is then false
+                    # for every page, so nothing is appended and the method returns [] at the
+                    # Phase 2 guard - confirm whether a Gemini-based detection step is missing.
+                    for page_num, image in page_image_tuples:
+                        if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
+                            detection_result = LocalChartDetector.detect_charts(image)
+                            if not detection_result['has_chart']:
+                                logger.debug(f"Page {page_num}: No chart detected (skipping)")
+                                stats['api_calls_saved'] += 1
+                                continue
+                            confidence = detection_result['confidence']
+                            if confidence < parameters.CHART_MIN_CONFIDENCE:
+                                logger.debug(f"Page {page_num}: Low confidence ({confidence:.0%}), skipping")
+                                stats['api_calls_saved'] += 1
+                                continue
+                            logger.info(f"📈 Chart detected on page {page_num} (confidence: {confidence:.0%})")
+                            stats['charts_detected_local'] += 1
+                            image_path = os.path.join(temp_dir, f'chart_page_{page_num}.jpg')
+                            image.save(image_path, 'JPEG', quality=90)
+                            detected_charts.append((page_num, image_path, detection_result))
+                logger.info(f"Phase 1 complete: {len(detected_charts)} charts detected and cached")
+                # === PHASE 2: PARALLEL GEMINI BATCH ANALYSIS (I/O-BOUND) ===
+                if not detected_charts or not self.gemini_client:
+                    return []
+                logger.info(f"Phase 2: Batch analyzing {len(detected_charts)} charts...")
+                chart_documents = []
+                if parameters.CHART_ENABLE_BATCH_ANALYSIS and len(detected_charts) > 1:
+                    # Batch processing with parallel Gemini API calls
+                    gemini_batch_size = parameters.CHART_GEMINI_BATCH_SIZE
+                    batches = [detected_charts[i:i + gemini_batch_size] for i in range(0, len(detected_charts), gemini_batch_size)]
+                    # Prepare batch tuples with batch_num and total_batches
+                    # NOTE(review): the same mutable `stats` dict is handed to every worker
+                    # thread; presumably analyze_batch updates it - confirm it is safe.
+                    batch_tuples = [
+                        (batch, idx + 1, len(batches), self.gemini_client, file_path, parameters, stats)
+                        for idx, batch in enumerate(batches)
+                    ]
+                    # One slot per batch so documents can be emitted in batch order
+                    # regardless of completion order; failed batches stay None.
+                    results = [None] * len(batches)
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+                        future_to_idx = {executor.submit(analyze_batch, batch_tuple): idx for idx, batch_tuple in enumerate(batch_tuples)}
+                        for future in concurrent.futures.as_completed(future_to_idx):
+                            idx = future_to_idx[future]
+                            try:
+                                # NOTE(review): the slot is chosen by the index analyze_batch
+                                # returns, not by `idx`; verify both use the same base.
+                                batch_idx, batch_docs = future.result()
+                                results[batch_idx] = batch_docs
+                            except Exception as exc:
+                                logger.error(f"Batch {idx} generated an exception: {exc}")
+                    # Flatten results and filter out None
+                    chart_index = 0
+                    for batch_docs in results:
+                        if batch_docs:
+                            for doc in batch_docs:
+                                # Overwrite chunk_id with a running index across all batches.
+                                doc.metadata["chunk_id"] = f"{file_hash}_{doc.metadata.get('page', 0)}_{chart_index}"
+                                chart_documents.append(doc)
+                                chart_index += 1
+                else:
+                    # Sequential processing (batch disabled or single chart)
+                    for chart_index, (page_num, image_path, detection_result) in enumerate(detected_charts):
+                        try:
+                            img = Image.open(image_path)
+                            extraction_prompt = """Analyze this chart/graph in comprehensive detail:
+                            **Chart Type**: [type]
+                            **Title**: [title]
+                            **Axes**: [X and Y labels/units]
+                            **Data Points**: [extract all visible data]
+                            **Legend**: [series/categories]
+                            **Trends**: [key patterns and insights]
+                            **Key Values**: [max, min, significant]
+                            **Context**: [annotations or notes]
+                            """
+                            chart_response = self.gemini_client.models.generate_content(
+                                model=parameters.CHART_VISION_MODEL,
+                                contents=[extraction_prompt, img],
+                                config=types.GenerateContentConfig(
+                                    max_output_tokens=parameters.CHART_MAX_TOKENS
+                                )
+                            )
+                            chart_types_str = ", ".join(detection_result['chart_types']) or "Unknown"
+                            chart_doc = Document(
+                                page_content=f"""### \U0001F4CA Chart Analysis (Page {page_num})\n\n**Detection Method**: Hybrid (Local OpenCV + Gemini Sequential)\n**Local Confidence**: {detection_result['confidence']:.0%}\n**Detected Types**: {chart_types_str}\n\n---\n\n{chart_response.text}\n""",
+                                metadata={
+                                    "source": file_path,
+                                    "page": page_num,
+                                    "type": "chart",
+                                    "extraction_method": "hybrid_sequential",
+                                    "chunk_id": f"{file_hash}_{page_num}_{chart_index}"
+                                }
+                            )
+                            chart_documents.append(chart_doc)
+                            stats['charts_analyzed_gemini'] += 1
+                            # NOTE(review): img is only closed on success; an exception above
+                            # skips this close.
+                            img.close()
+                            logger.info(f"✅ Analyzed chart on page {page_num}")
+                        except Exception as e:
+                            # One failed chart does not abort the remaining charts.
+                            logger.error(f"Failed to analyze page {page_num}: {e}")
+                # Log statistics
+                if use_local and parameters.CHART_SKIP_GEMINI_DETECTION:
+                    # 0.0125 is the per-call figure used for the estimates below; presumably
+                    # dollars per Gemini call - TODO confirm against current pricing.
+                    cost_saved = stats['api_calls_saved'] * 0.0125
+                    actual_cost = stats['batch_api_calls'] * 0.0125 if stats['batch_api_calls'] > 0 else stats['charts_analyzed_gemini'] * 0.0125
+                    if stats['batch_api_calls'] > 0:
+                        efficiency = stats['charts_analyzed_gemini'] / stats['batch_api_calls']
+                    else:
+                        efficiency = 1.0
+                    logger.info(f"""
+📊 Chart Extraction Complete (HYBRID + BATCH MODE):
+   Pages scanned: {stats['pages_scanned']}
+   Charts detected (local): {stats['charts_detected_local']}
+   Charts analyzed (Gemini): {stats['charts_analyzed_gemini']}
+   Batch API calls: {stats['batch_api_calls']}
+   Charts per API call: {efficiency:.1f}
+   API calls saved (detection): {stats['api_calls_saved']}
+   Estimated cost savings: ${cost_saved:.3f}
+   Actual API cost: ${actual_cost:.3f}
+""")
+                # After chart_documents is created (batch or sequential), deduplicate by title:
+                chart_documents = deduplicate_charts_by_title(chart_documents)
+                return chart_documents
+            finally:
+                # Only clean up after all analysis is done
+                # (runs on every exit path, including the early `return []` above).
+                try:
+                    import shutil
+                    shutil.rmtree(temp_dir)
+                    logger.debug(f"Cleaned up temp directory: {temp_dir}")
+                except Exception as e:
+                    logger.warning(f"Failed to clean temp directory {temp_dir}: {e}")
+        except ImportError as e:
+            logger.warning(f"Dependencies missing for chart extraction: {e}")
+            return []
+        except MemoryError as e:
+            logger.error(f"Out of memory while processing {file_path}. Try reducing DPI or batch size.")
+            return []
+        except Exception as e:
+            logger.error(f"Chart extraction failed for {file_path}: {e}", exc_info=True)
+            return []
+    def _load_pdf_with_pdfplumber(self, file_path: str) -> List[Document]:
+        """
+        Load PDF using pdfplumber for text and table extraction.
+        Uses multiple table detection strategies for complex tables.
+        """
+        import pdfplumber
+        logger.info(f"[PDFPLUMBER] Processing: {file_path}")
+        file_hash = self._generate_hash(file_path.encode())
+        # Strategy 1: Line-based (default) - for tables with visible borders
+        default_parameters = {}
+        # Strategy 2: Text-based - for borderless tables with aligned text
+        text_parameters = {
+            "vertical_strategy": "text",
+            "horizontal_strategy": "text",
+            "snap_tolerance": 5,
+            "join_tolerance": 5,
+            "edge_min_length": 3,
+            "min_words_vertical": 2,
+            "min_words_horizontal": 1,
+            "text_tolerance": 3,
+            "intersection_tolerance": 5,
+        }
+        # Strategy 3: Lines + text hybrid - for complex tables
+        hybrid_parameters = {
+            "vertical_strategy": "lines_strict",
+            "horizontal_strategy": "text",
+            "snap_tolerance": 5,
+            "join_tolerance": 5,
+            "min_words_horizontal": 1,
+        }
+        all_content = []
+        total_tables = 0
+        with pdfplumber.open(file_path) as pdf:
+            for page_num, page in enumerate(pdf.pages, 1):
+                page_content = [f"## Page {page_num}"]
+                page_tables = []
+                table_hashes = set()  # Track unique tables
+                def add_table_if_unique(table, strategy_name):
+                    """Add table if not already found."""
+                    if not table or len(table) < 2:
+                        return False
+                    # Create hash of table content
+                    table_str = str(table)
+                    table_hash = hash(table_str)
+                    if table_hash not in table_hashes:
+                        table_hashes.add(table_hash)
+                        page_tables.append((table, strategy_name))
+                        return True
+                    return False
+                # --- Robust per-page error handling ---
+                try:
+                    # Strategy 1: Default line-based detection
+                    try:
+                        default_tables = page.extract_tables()
+                        if default_tables:
+                            for t in default_tables:
+                                add_table_if_unique(t, "default")
+                    except Exception as e:
+                        logger.warning(f"Default strategy failed on page {page_num}: {e}")
+                    # Strategy 2: Text-based detection for borderless tables
+                    try:
+                        text_tables = page.extract_tables(text_parameters)
+                        if text_tables:
+                            for t in text_tables:
+                                add_table_if_unique(t, "text")
+                    except Exception as e:
+                        logger.warning(f"Text strategy failed on page {page_num}: {e}")
+                    # Strategy 3: Hybrid detection
+                    try:
+                        hybrid_tables = page.extract_tables(hybrid_parameters)
+                        if hybrid_tables:
+                            for t in hybrid_tables:
+                                add_table_if_unique(t, "hybrid")
+                    except Exception as e:
+                        logger.warning(f"Hybrid strategy failed on page {page_num}: {e}")
+                    # Strategy 4: Use find_tables() for more control
+                    try:
+                        found_tables = page.find_tables(text_parameters)
+                        if found_tables:
+                            for ft in found_tables:
+                                t = ft.extract()
+                                add_table_if_unique(t, "find_tables")
+                    except Exception as e:
+                        logger.warning(f"find_tables() failed on page {page_num}: {e}")
+                    # Convert tables to markdown
+                    for table, strategy in page_tables:
+                        total_tables += 1
+                        md_table = self._table_to_markdown(table, page_num, total_tables)
+                        if md_table:
+                            page_content.append(md_table)
+                    # Extract text
+                    try:
+                        text = page.extract_text()
+                        if text:
+                            page_content.append(text.strip())
+                    except Exception as e:
+                        logger.warning(f"Text extraction failed on page {page_num}: {e}")
+                    if len(page_content) > 1:
+                        combined = "\n\n".join(page_content)
+                        chunk_id = f"{file_hash}_{page_num}_0"
+                        doc = Document(
+                            page_content=combined,
+                            metadata={
+                                "source": file_path,
+                                "page": page_num,
+                                "loader": "pdfplumber",
+                                "tables_count": total_tables,
+                                "type": "text",
+                                "chunk_id": chunk_id
+                            }
+                        )
+                        all_content.append(doc)
+                except Exception as e:
+                    logger.warning(f"Skipping page {page_num} due to error: {e}")
+                    continue
+        logger.info(f"[PDFPLUMBER] Extracted {len(all_content)} chunks, {total_tables} tables")
+        return all_content
+    def _table_to_markdown(self, table: List[List], page_num: int, table_idx: int) -> str:
+        """Convert a table (list of rows) to markdown format."""
+        if not table or len(table) < 1:
+            return ""
+        # Clean up cells
+        cleaned_table = []
+        for row in table:
+            if row:
+                cleaned_row = []
+                for cell in row:
+                    if cell:
+                        cell_text = str(cell).replace('\n', ' ').replace('\r', ' ').replace('|', '\\|').strip()
+                        cleaned_row.append(cell_text)
+                    else:
+                        cleaned_row.append("")
+                if any(cleaned_row):
+                    cleaned_table.append(cleaned_row)
+        if len(cleaned_table) < 1:
+            return ""
+        # Determine max columns and pad rows
+        max_cols = max(len(row) for row in cleaned_table)
+        for row in cleaned_table:
+            while len(row) < max_cols:
+                row.append("")
+        # Build markdown table
+        md_lines = [f"### Table {table_idx} (Page {page_num})"]
+        md_lines.append("| " + " | ".join(cleaned_table[0]) + " |")
+        md_lines.append("| " + " | ".join(["---"] * max_cols) + " |")
+        for row in cleaned_table[1:]:
+            md_lines.append("| " + " | ".join(row) + " |")
+        return "\n".join(md_lines)
+    def process(self, files: List, progress_callback=None) -> List[Document]:
+        """
+        Process multiple files with caching and deduplication.
+        """
+        self.validate_files(files)
+        all_chunks = []
+        seen_hashes = set()
+        logger.info(f"Processing {len(files)} file(s)...")
+        for file in files:
+            try:
+                with open(file.name, 'rb') as f:
+                    file_content = f.read()
+                    file_hash = self._generate_hash(file_content)
+                cache_path = self.cache_dir / f"{file_hash}.pkl"
+                if self._is_cache_valid(cache_path):
+                    chunks = self._load_from_cache(cache_path)
+                    if chunks:
+                        logger.info(f"Using cached chunks for {file.name}")
+                    else:
+                        chunks = self._process_file(file, progress_callback=progress_callback)
+                        self._save_to_cache(chunks, cache_path)
+                else:
+                    logger.info(f"Processing and caching: {file.name}")
+                    chunks = self._process_file(file, progress_callback=progress_callback)
+                    self._save_to_cache(chunks, cache_path)
+                for chunk in chunks:
+                    chunk_hash = self._generate_hash(chunk.page_content.encode())
+                    if chunk_hash not in seen_hashes:
+                        seen_hashes.add(chunk_hash)
+                        all_chunks.append(chunk)
+            except Exception as e:
+                logger.error(f"Failed to process {file.name}: {e}", exc_info=True)
+                continue
+        logger.info(f"Processing complete: {len(all_chunks)} unique chunks from {len(files)} file(s)")
+        return all_chunks
+def run_pdfplumber(file_name):
+    from content_analyzer.document_parser import DocumentProcessor
+    processor = DocumentProcessor()
+    return processor._load_pdf_with_pdfplumber(file_name)
+def run_charts(file_name, enable_chart_extraction, gemini_client):
+    from content_analyzer.document_parser import DocumentProcessor
+    processor = DocumentProcessor()
+    processor.gemini_client = gemini_client
+    if enable_chart_extraction and gemini_client:
+        return processor._extract_charts_from_pdf(file_name)
+    return []

content_analyzer/visual_detector.py ADDED Viewed

	@@ -0,0 +1,354 @@

+"""
+Local Chart Detection Module - NO API CALLS
+Uses OpenCV and image analysis for chart detection without any LLM cost.
+This module provides FREE chart detection as an alternative to expensive Gemini Vision API calls.
+Author: SmartDoc AI
+License: MIT
+"""
+import logging
+from typing import Dict, Any
+logger = logging.getLogger(__name__)
+class LocalChartDetector:
+    """
+    Detects charts in images using OpenCV - completely free, no API calls.
+    Detection Features:
+        - Edge detection (Canny)
+        - Line detection (HoughLinesP)
+        - Circle detection (HoughCircles)
+        - Contour analysis for shapes
+        - Axis pattern recognition
+    Detectable Chart Types:
+        - Line charts (multiple organized lines)
+        - Bar charts (rectangular shapes)
+        - Pie charts (circular patterns)
+        - Scatter plots (lines + circles)
+        - Charts with axes (H/V line patterns)
+        - Bubble charts (circles with variable size)
+        - Zone diagrams (areas with color coding)
+    """
+    @staticmethod
+    def detect_charts(image) -> Dict[str, Any]:
+        """
+        Detects complex charts and visualizations only - rejects tables, maps, and simple graphics.
+        Returns a dictionary with detection results and features.
+        """
+        import time
+        start_time = time.time()
+        try:
+            import cv2
+            import numpy as np
+            from PIL import Image as PILImage
+            # --- Image Preparation ---
+            # Convert PIL image to OpenCV format if needed
+            if isinstance(image, PILImage.Image):
+                image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            else:
+                image_cv = image
+            height, width = image_cv.shape[:2]
+            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
+            # --- Edge Detection ---
+            edges = cv2.Canny(gray, 50, 150)
+            # --- Edge Density Calculation ---
+            w_half = width // 2
+            left_region = edges[:, :w_half]
+            right_region = edges[:, w_half:]
+            left_edge_density = np.sum(left_region > 0) / (left_region.shape[0] * left_region.shape[1])
+            right_edge_density = np.sum(right_region > 0) / (right_region.shape[0] * right_region.shape[1])
+            overall_edge_density = np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])
+            has_text_region = (
+                (left_edge_density > 0.08 and right_edge_density > 0.08) or
+                overall_edge_density > 0.15
+            )
+            # --- Line Detection ---
+            lines = cv2.HoughLinesP(
+                edges,
+                rho=1,
+                theta=np.pi/180,
+                threshold=100,
+                minLineLength=100,
+                maxLineGap=10
+            )
+            line_count = len(lines) if lines is not None else 0
+            diagonal_lines = 0
+            line_angles = []
+            if lines is not None:
+                for line in lines:
+                    x1, y1, x2, y2 = line[0]
+                    angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
+                    if 10 < angle < 80 or 100 < angle < 170:
+                        diagonal_lines += 1
+                        line_angles.append(angle)
+            # --- Circle Detection (Optimized) ---
+            run_circles = diagonal_lines >= 1 or line_count >= 6 or overall_edge_density > 0.08
+            circle_count = 0
+            circles = None
+            if run_circles:
+                scale = 0.5 if max(height, width) > 800 else 1.0
+                small_gray = cv2.resize(gray, (int(width*scale), int(height*scale)), interpolation=cv2.INTER_AREA) if scale < 1.0 else gray
+                circles = cv2.HoughCircles(
+                    small_gray,
+                    cv2.HOUGH_GRADIENT,
+                    dp=2.5,
+                    minDist=60,
+                    param1=50,
+                    param2=55,
+                    minRadius=18,
+                    maxRadius=100
+                )
+                if circles is not None:
+                    circle_count = circles.shape[2]
+            # --- Color Diversity Analysis ---
+            hsv = cv2.cvtColor(image_cv, cv2.COLOR_BGR2HSV)
+            hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
+            color_peaks = np.sum(hist > np.mean(hist) * 2)
+            # --- Contour Detection ---
+            contours, _ = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+            significant_contours = 0
+            rectangle_contours = 0
+            similar_rectangles = []
+            large_contours = 0
+            small_scattered_contours = 0
+            for contour in contours:
+                area = cv2.contourArea(contour)
+                if area < 500:
+                    small_scattered_contours += 1
+                elif 1500 < area < 40000:
+                    significant_contours += 1
+                    peri = cv2.arcLength(contour, True)
+                    approx = cv2.approxPolyDP(contour, 0.04 * peri, True)
+                    if len(approx) == 4:
+                        rectangle_contours += 1
+                        x, y, w, h = cv2.boundingRect(contour)
+                        similar_rectangles.append((w, h, area))
+                elif 40000 < area < 500000:
+                    large_contours += 1
+            # --- Bar Chart Pattern Detection ---
+            bar_pattern = False
+            if len(similar_rectangles) >= 6:
+                widths = [r[0] for r in similar_rectangles]
+                heights = [r[1] for r in similar_rectangles]
+                width_std = np.std(widths)
+                height_std = np.std(heights)
+                avg_width = np.mean(widths)
+                avg_height = np.mean(heights)
+                if (width_std < avg_width * 0.3 or height_std < avg_height * 0.3):
+                    bar_pattern = True
+            # --- Line Classification ---
+            horizontal_lines = 0
+            vertical_lines = 0
+            diagonal_lines = 0
+            line_angles = []
+            very_short_lines = 0
+            if lines is not None:
+                for line in lines:
+                    x1, y1, x2, y2 = line[0]
+                    length = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
+                    if length < 50:
+                        very_short_lines += 1
+                        continue
+                    if length < 80:
+                        continue
+                    angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
+                    line_angles.append(angle)
+                    if angle < 10 or angle > 170:
+                        horizontal_lines += 1
+                    elif 80 < angle < 100:
+                        vertical_lines += 1
+                    else:
+                        diagonal_lines += 1
+            angle_variance = np.var(line_angles) if len(line_angles) > 2 else 0
+            # --- Debug Logging ---
+            logger.debug(f"Chart detection features: lines={line_count}, diagonal_lines={diagonal_lines}, circles={circle_count}, horizontal_lines={horizontal_lines}, vertical_lines={vertical_lines}, color_peaks={color_peaks}, angle_variance={angle_variance}")
+            # --- Chart Heuristics and Classification ---
+            chart_types = []
+            confidence = 0.0
+            description = ""
+            rejection_reason = ""
+            # Negative checks (text slides, decorative backgrounds, tables)
+            if has_text_region and circle_count < 2 and diagonal_lines < 2 and not bar_pattern:
+                if small_scattered_contours > 100 or very_short_lines > 50:
+                    rejection_reason = f"Text slide with decorative background (overall density: {overall_edge_density:.2%})"
+                    logger.debug(f"Rejected: {rejection_reason}")
+                    return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
+            if very_short_lines > 50 and circle_count < 2 and diagonal_lines < 3 and line_count < 10:
+                rejection_reason = f"Decorative network background ({very_short_lines} tiny lines, no data elements)"
+                logger.debug(f"Rejected: {rejection_reason}")
+                return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
+            if horizontal_lines > 12 and vertical_lines > 12 and circle_count == 0 and diagonal_lines < 2:
+                grid_lines = horizontal_lines + vertical_lines
+                total_lines = line_count
+                grid_ratio = grid_lines / max(total_lines, 1)
+                if grid_ratio > 0.75:
+                    rejection_reason = f"Simple table pattern (H:{horizontal_lines}, V:{vertical_lines})"
+                    logger.debug(f"Rejected: {rejection_reason}")
+                    return _chart_result(False, 0.0, [], rejection_reason, line_count, circle_count, overall_edge_density)
+            # Positive chart heuristics (bubble, scatter, line, pie, bar, complex)
+            # RELAXED: Detect as line chart if 2+ diagonal lines and angle variance > 40, or 1+ diagonal line and 1+ axis
+            if (
+                (diagonal_lines >= 2 and angle_variance > 40) or
+                (diagonal_lines >= 1 and (horizontal_lines >= 1 or vertical_lines >= 1))
+            ):
+                chart_types.append("line_chart")
+                confidence = max(confidence, min(0.88, 0.6 + (diagonal_lines / 40)))
+                if (horizontal_lines >= 1 or vertical_lines >= 1):
+                    confidence = min(0.95, confidence + 0.08)
+                if not description:
+                    description = f"Line chart: {diagonal_lines} diagonal lines, axes: {horizontal_lines+vertical_lines}, variance: {angle_variance:.0f}"
+            if circle_count >= 5:
+                chart_types.append("bubble_chart")
+                confidence = min(0.92, 0.70 + (min(circle_count, 20) * 0.01))
+                description = f"Bubble chart: {circle_count} circles"
+                if color_peaks > 5:
+                    confidence = min(0.95, confidence + 0.1)
+                    description += f", {int(color_peaks)} color zones"
+                if large_contours > 2:
+                    confidence = min(0.97, confidence + 0.05)
+                    chart_types.append("zone_diagram")
+                    description += f", {large_contours} colored regions"
+            elif circle_count >= 3 and diagonal_lines > 2:
+                chart_types.append("scatter_plot")
+                confidence = max(confidence, 0.75)
+                description = f"Scatter plot: {circle_count} data points"
+            if circle_count > 0 and circle_count < 5:
+                if "bubble_chart" not in chart_types:
+                    chart_types.append("pie_chart")
+                    confidence = max(confidence, 0.80)
+                    if not description:
+                        description = f"Pie chart: {circle_count} circular pattern(s)"
+            if bar_pattern and rectangle_contours >= 6:
+                chart_types.append("bar_chart")
+                confidence = max(confidence, 0.75 + (min(rectangle_contours, 12) / 40))
+                if not description:
+                    description = f"Bar chart: {rectangle_contours} bars"
+            if circle_count >= 3 and large_contours >= 2 and color_peaks > 5:
+                chart_types.append("complex_visualization")
+                confidence = max(confidence, 0.85)
+                if not description:
+                    description = "Complex visualization with zones and data points"
+            has_moderate_axes = (1 <= horizontal_lines <= 6 or 1 <= vertical_lines <= 6)
+            has_real_data = (circle_count >= 3 or diagonal_lines >= 2 or bar_pattern)
+            if has_moderate_axes and has_real_data and confidence > 0.3:
+                confidence = min(0.90, confidence + 0.10)
+                if not description:
+                    description = f"Chart with axes and data elements"
+            # Final chart determination
+            strong_indicator = (
+                (diagonal_lines >= 2 and angle_variance > 40) or
+                (diagonal_lines >= 1 and (horizontal_lines >= 1 or vertical_lines >= 1)) or
+                circle_count >= 5 or
+                (circle_count >= 3 and large_contours >= 2) or
+                bar_pattern or
+                (circle_count >= 3 and color_peaks > 5)
+            )
+            has_chart = (
+                len(chart_types) > 0 and
+                confidence > 0.4 and
+                strong_indicator
+            )
+            total_time = time.time() - start_time
+            if has_chart:
+                logger.info(f"?? OpenCV detection: {total_time*1000:.0f}ms (lines:{line_count}, diagonal_lines:{diagonal_lines}, circles:{circle_count}, axes:{horizontal_lines+vertical_lines}, angle_variance:{angle_variance})")
+            else:
+                logger.debug(f"?? OpenCV detection: {total_time*1000:.0f}ms (rejected)")
+            return {
+                'has_chart': has_chart,
+                'confidence': float(confidence),
+                'chart_types': list(set(chart_types)),
+                'description': description or "Potential chart detected",
+                'features': {
+                    'lines': line_count,
+                    'diagonal_lines': diagonal_lines,
+                    'circles': circle_count,
+                    'contours': significant_contours,
+                    'rectangles': rectangle_contours,
+                    'horizontal_lines': horizontal_lines,
+                    'vertical_lines': vertical_lines,
+                    'angle_variance': float(angle_variance),
+                    'bar_pattern': bar_pattern,
+                    'large_contours': large_contours,
+                    'color_peaks': int(color_peaks),
+                    'text_region': has_text_region,
+                    'very_short_lines': very_short_lines,
+                    'overall_edge_density': float(overall_edge_density),
+                    'detection_time_ms': float(total_time * 1000)
+                }
+            }
+        except ImportError as e:
+            logger.warning(f"OpenCV not installed: {e}")
+            logger.info("Install with: pip install opencv-python")
+            return {
+                'has_chart': False,
+                'confidence': 0.0,
+                'chart_types': [],
+                'description': 'OpenCV required for local detection',
+                'features': {},
+                'error': 'opencv_not_installed'
+            }
+        except Exception as e:
+            logger.error(f"Chart detection error: {e}")
+            return {
+                'has_chart': False,
+                'confidence': 0.0,
+                'chart_types': [],
+                'description': f'Detection error: {str(e)}',
+                'features': {},
+                'error': str(e)
+            }
+def _chart_result(has_chart, confidence, chart_types, description, line_count, circle_count, overall_edge_density):
+    """Helper to return a standard chart detection result dict."""
+    return {
+        'has_chart': has_chart,
+        'confidence': confidence,
+        'chart_types': chart_types,
+        'description': description,
+        'features': {
+            'lines': line_count,
+            'circles': circle_count,
+            'overall_edge_density': float(overall_edge_density)
+        }
+    }
+# Detection configuration thresholds (BALANCED - detect real charts, reject pure text)
+DETECTION_CONFIG = {
+    'min_circles_bubble_chart': 5,
+    'min_circles_scatter': 3,
+    'min_diagonal_lines': 5,           # Lowered from 8 for line charts
+    'min_angle_variance': 150,         # Lowered from 200 for line charts
+    'min_rectangle_contours': 6,
+    'min_confidence_threshold': 0.4,   # Lowered from 0.5
+    'max_grid_ratio': 0.75,
+    'max_text_edge_density_both': 0.08,  # Both sides text
+    'max_text_edge_density_overall': 0.15,  # Entire page text
+    'min_very_short_lines_mesh': 50,
+    'axis_confidence_bonus': 0.10,
+    'min_line_length': 80,
+    'contour_area_min': 1500,
+    'contour_area_max': 40000,
+    'large_contour_min': 40000,
+    'large_contour_max': 500000,
+    'circle_radius_min': 15,
+    'circle_radius_max': 300,
+    'min_bar_chart_bars': 6,
+    'min_color_peaks': 5
+}

core/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .logging import logger
2	+
3	+ __all__ = ["logger"]

core/diagnostics.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""
+Health check utilities for DocChat.
+This module provides diagnostics check functions that can be used
+to verify the application is running correctly.
+"""
+import logging
+from typing import Dict, Any
+from datetime import datetime
+logger = logging.getLogger(__name__)
+def check_diagnostics() -> Dict[str, Any]:
+    """
+    Perform a comprehensive diagnostics check of the application.
+    Returns:
+        Dict with diagnostics status and component information
+    """
+    diagnostics_status = {
+        "status": "diagnosticsy",
+        "timestamp": datetime.utcnow().isoformat(),
+        "components": {}
+    }
+    # Check parameters
+    try:
+        from configuration.parameters import parameters
+        diagnostics_status["components"]["parameters"] = {
+            "status": "ok",
+            "chroma_db_path": parameters.CHROMA_DB_PATH,
+            "log_level": parameters.LOG_LEVEL
+        }
+    except Exception as e:
+        diagnostics_status["components"]["parameters"] = {
+            "status": "error",
+            "error": str(e)
+        }
+        diagnostics_status["status"] = "undiagnosticsy"
+    # Check ChromaDB directory
+    try:
+        from pathlib import Path
+        chroma_path = Path(parameters.CHROMA_DB_PATH)
+        diagnostics_status["components"]["chroma_db"] = {
+            "status": "ok",
+            "path_exists": chroma_path.exists(),
+            "is_writable": chroma_path.exists() and chroma_path.is_dir()
+        }
+    except Exception as e:
+        diagnostics_status["components"]["chroma_db"] = {
+            "status": "error",
+            "error": str(e)
+        }
+    # Check cache directory
+    try:
+        cache_path = Path(parameters.CACHE_DIR)
+        diagnostics_status["components"]["cache"] = {
+            "status": "ok",
+            "path_exists": cache_path.exists(),
+            "is_writable": cache_path.exists() and cache_path.is_dir()
+        }
+    except Exception as e:
+        diagnostics_status["components"]["cache"] = {
+            "status": "error",
+            "error": str(e)
+        }
+    # Check if required packages are importable
+    required_packages = [
+        "langchain",
+        "langchain_google_genai",
+        "chromadb",
+        "gradio"
+    ]
+    packages_status = {}
+    for package in required_packages:
+        try:
+            __import__(package)
+            packages_status[package] = "ok"
+        except ImportError as e:
+            packages_status[package] = f"missing: {e}"
+            diagnostics_status["status"] = "degraded"
+    diagnostics_status["components"]["packages"] = packages_status
+    return diagnostics_status
+def check_api_key() -> Dict[str, Any]:
+    """
+    Check if the Google API key is configured and valid format.
+    Returns:
+        Dict with API key status (does not expose the key)
+    """
+    try:
+        from configuration.parameters import parameters
+        api_key = parameters.GOOGLE_API_KEY
+        if not api_key:
+            return {"status": "missing", "message": "GOOGLE_API_KEY not set"}
+        if len(api_key) < 20:
+            return {"status": "invalid", "message": "API key appears too short"}
+        # Mask the key for logging (show first 4 and last 4 chars)
+        masked = f"{api_key[:4]}...{api_key[-4:]}"
+        return {
+            "status": "configured",
+            "masked_key": masked,
+            "length": len(api_key)
+        }
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
+if __name__ == "__main__":
+    # Run diagnostics check when executed directly
+    import json
+    print(json.dumps(check_diagnostics(), indent=2))

core/lifecycle.py ADDED Viewed

	@@ -0,0 +1,160 @@

+"""
+Signal handling and graceful lifecycle utilities.
+This module provides graceful lifecycle handling for the DocChat application,
+ensuring resources are properly cleaned up when the application is terminated.
+"""
+import signal
+import sys
+import logging
+import atexit
+from typing import Callable, List, Optional
+from pathlib import Path
+logger = logging.getLogger(__name__)
+class ShutdownHandler:
+    """
+    Manages graceful lifecycle of the application.
+    Registers cleanup callbacks that are executed when the application
+    receives a termination signal (SIGINT, SIGTERM) or exits normally.
+    """
+    _instance: Optional['ShutdownHandler'] = None
+    def __new__(cls) -> 'ShutdownHandler':
+        """Singleton pattern to ensure only one handler exists."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+    def __init__(self) -> None:
+        """Initialize the lifecycle handler."""
+        if self._initialized:
+            return
+        self._cleanup_callbacks: List[Callable] = []
+        self._lifecycle_in_progress: bool = False
+        self._initialized = True
+        # Register signal handlers
+        signal.signal(signal.SIGINT, self._signal_handler)
+        signal.signal(signal.SIGTERM, self._signal_handler)
+        # Register atexit handler for normal exits
+        atexit.register(self._atexit_handler)
+        logger.info("[SHUTDOWN] ShutdownHandler initialized")
+    def register_cleanup(self, callback: Callable, name: str = "") -> None:
+        """
+        Register a cleanup callback to be called on lifecycle.
+        Args:
+            callback: Function to call during lifecycle
+            name: Optional name for logging purposes
+        """
+        self._cleanup_callbacks.append((callback, name))
+        logger.debug(f"[SHUTDOWN] Registered cleanup callback: {name or callback.__name__}")
+    def _signal_handler(self, signum: int, frame) -> None:
+        """
+        Handle termination signals.
+        Args:
+            signum: Signal number
+            frame: Current stack frame
+        """
+        signal_name = signal.Signals(signum).name
+        logger.info(f"[SHUTDOWN] Received {signal_name}, initiating graceful lifecycle...")
+        self._execute_cleanup()
+        sys.exit(0)
+    def _atexit_handler(self) -> None:
+        """Handle normal application exit."""
+        if not self._lifecycle_in_progress:
+            logger.info("[SHUTDOWN] Application exiting normally, running cleanup...")
+            self._execute_cleanup()
+    def _execute_cleanup(self) -> None:
+        """Execute all registered cleanup callbacks."""
+        if self._lifecycle_in_progress:
+            return
+        self._lifecycle_in_progress = True
+        logger.info(f"[SHUTDOWN] Executing {len(self._cleanup_callbacks)} cleanup callbacks...")
+        for callback, name in reversed(self._cleanup_callbacks):
+            try:
+                callback_name = name or callback.__name__
+                logger.debug(f"[SHUTDOWN] Running cleanup: {callback_name}")
+                callback()
+                logger.debug(f"[SHUTDOWN] ? Cleanup completed: {callback_name}")
+            except Exception as e:
+                logger.error(f"[SHUTDOWN] ? Cleanup failed: {e}", exc_info=True)
+        logger.info("[SHUTDOWN] ? All cleanup callbacks executed")
+def cleanup_chroma_db() -> None:
+    """Clean up ChromaDB connections."""
+    try:
+        # ChromaDB cleanup if needed
+        logger.info("[CLEANUP] Cleaning up ChromaDB...")
+        # ChromaDB uses SQLite which handles cleanup automatically
+        logger.info("[CLEANUP] ? ChromaDB cleanup complete")
+    except Exception as e:
+        logger.error(f"[CLEANUP] ChromaDB cleanup failed: {e}")
+def cleanup_temp_files() -> None:
+    """Clean up temporary files created during processing."""
+    try:
+        import tempfile
+        import shutil
+        # Clean up any temp directories we created
+        temp_base = Path(tempfile.gettempdir())
+        # Only clean up directories that match our pattern
+        # Be conservative to avoid deleting user data
+        logger.info("[CLEANUP] Temporary file cleanup complete")
+    except Exception as e:
+        logger.error(f"[CLEANUP] Temp file cleanup failed: {e}")
+def cleanup_logging() -> None:
+    """Flush and close all log handlers."""
+    try:
+        logger.info("[CLEANUP] Flushing log handlers...")
+        # Get root logger and flush all handlers
+        root_logger = logging.getLogger()
+        for handler in root_logger.handlers:
+            handler.flush()
+        logger.info("[CLEANUP] ? Log handlers flushed")
+    except Exception as e:
+        # Can't log this since logging might be broken
+        print(f"[CLEANUP] Log handler cleanup failed: {e}", file=sys.stderr)
+def initialize_lifecycle_handler() -> ShutdownHandler:
+    """
+    Initialize the lifecycle handler with default cleanup callbacks.
+    Returns:
+        The initialized ShutdownHandler instance
+    """
+    handler = ShutdownHandler()
+    # Register default cleanup callbacks (order matters - reverse execution)
+    handler.register_cleanup(cleanup_logging, "Logging cleanup")
+    handler.register_cleanup(cleanup_temp_files, "Temp files cleanup")
+    handler.register_cleanup(cleanup_chroma_db, "ChromaDB cleanup")
+    return handler

core/logger.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""
+Logging utility module.
+This module provides a centralized logger instance using the standard library logging.
+The logging configuration is handled by configuration/logger_setup.py which should be
+called at application startup.
+Usage:
+    from core.logger import logger
+    logger.info("Your message here")
+"""
+import logging
+# Get a logger for the smartdoc module
+# The actual configuration (handlers, formatters) is done in configuration/logger_setup.py
+logger = logging.getLogger("smartdoc")

dependencies.txt ADDED Viewed

	@@ -0,0 +1,52 @@

+# Core dependencies
+aiofiles>=23.2.1
+aiohttp>=3.11.0
+annotated-types>=0.7.0
+anyio>=4.8.0
+# PDF Processing
+pdfplumber>=0.11.0
+pdf2image>=1.17.0
+Pillow>=10.0.0
+# Computer Vision for local chart detection (cost optimization)
+opencv-python>=4.8.0
+# LangChain ecosystem
+langchain>=0.3.16
+langchain-core>=0.3.32
+langchain-text-splitters>=0.3.5
+langchain-google-genai>=2.0.0
+langchain-community>=0.3.16
+# Google AI for chart analysis
+google-generativeai>=0.8.0
+# Vector store
+chromadb>=0.6.3
+# Web framework
+gradio>=5.13.0
+# Data processing
+pandas>=2.1.4
+numpy>=1.26.4
+beautifulsoup4>=4.12.3
+# Document loaders
+python-docx>=1.1.2
+docx2txt>=0.8
+# Configuration
+pydantic>=2.11.10,<2.12.5
+pydantic-settings>=2.10.1,<3.0.0
+python-dotenv>=1.0.1
+# BM25 retriever
+rank-bm25>=0.2.2
+# Utilities
+tqdm>=4.67.0
+requests>=2.32.0
+tiktoken>=0.8.0
+tenacity>=9.0.0

intelligence/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .knowledge_synthesizer import ResearchAgent
+from .accuracy_verifier import VerificationAgent
+from .orchestrator import AgentWorkflow
+__all__ = ["ResearchAgent", "VerificationAgent", "AgentWorkflow"]

intelligence/accuracy_verifier.py ADDED Viewed

	@@ -0,0 +1,362 @@

+"""
+Verification agent module for answer validation against source documents.
+"""
+from typing import Dict, List, Optional, Literal
+from langchain_core.documents import Document
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import BaseModel, Field
+import logging
+from configuration.parameters import parameters
+logger = logging.getLogger(__name__)
+class VerificationResult(BaseModel):
+    """Structured output model for verification results."""
+    # Used as the schema for VerificationAgent's `with_structured_output` call.
+    # NOTE(review): the docstring and Field descriptions are presumably exported with
+    # that schema and seen by the model - confirm before rewording them.
+    # Required: overall support verdict for the answer.
+    supported: Literal["YES", "NO", "PARTIAL"] = Field(
+        description="Whether the answer is supported by the context"
+    )
+    # Optional: defaults to "MEDIUM" when the model omits it.
+    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
+        default="MEDIUM",
+        description="Confidence level in the verification result"
+    )
+    # Optional: a fresh empty list per instance via default_factory.
+    unsupported_claims: List[str] = Field(
+        default_factory=list,
+        description="Claims not supported by context"
+    )
+    # Optional: a fresh empty list per instance via default_factory.
+    contradictions: List[str] = Field(
+        default_factory=list,
+        description="Contradictions between answer and context"
+    )
+    # Required: relevance verdict for the answer.
+    relevant: Literal["YES", "NO"] = Field(
+        description="Whether the answer is relevant to the question"
+    )
+    # Optional: defaults to "PARTIAL".
+    completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field(
+        default="PARTIAL",
+        description="How completely the answer addresses the question"
+    )
+    # Optional free-text explanation; empty string by default.
+    additional_details: str = Field(
+        default="",
+        description="Additional explanations and reasoning"
+    )
+class BestAnswerSelection(BaseModel):
+    """Structured output model for selecting the best answer from candidates."""
+    # Used as the schema for VerificationAgent's answer-selection
+    # `with_structured_output` call.
+    # Required: position of the chosen candidate.
+    selected_index: int = Field(
+        description="The index (0-based) of the best answer from the candidates list"
+    )
+    # Required: justification for the choice.
+    reasoning: str = Field(
+        description="Explanation of why this answer was selected as the best"
+    )
+    # Optional: defaults to "MEDIUM".
+    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
+        default="MEDIUM",
+        description="Confidence level in the selection"
+    )
+    # Optional free-text comparison; empty string by default.
+    comparison_summary: str = Field(
+        default="",
+        description="Brief comparison of the candidate answers"
+    )
+class VerificationAgent:
+    """Agent for verifying answers against source documents."""
+    def __init__(
+        self,
+        llm: Optional[ChatGoogleGenerativeAI] = None,
+        max_context_chars: int = None,
+        max_output_tokens: int = None,
+    ) -> None:
+        """Initialize the verification agent."""
+        logger.info("Initializing VerificationAgent...")
+        self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS
+        self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS
+        base_llm = llm or ChatGoogleGenerativeAI(
+            model=parameters.VERIFICATION_AGENT_MODEL,
+            google_api_key=parameters.GOOGLE_API_KEY,
+            temperature=0,
+            max_output_tokens=self.max_output_tokens,
+        )
+        self.llm = base_llm
+        self.structured_llm = base_llm.with_structured_output(VerificationResult)
+        self.selection_llm = base_llm.with_structured_output(BestAnswerSelection)
+        logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})")
+    def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str:
+        """Generate verification prompt."""
+        question_section = f"\n**Original Question:** {question}\n" if question else ""
+        return f"""Verify the following answer against the provided context.
+**Check for:**
+1. Factual support (YES/NO/PARTIAL)
+2. Confidence level (HIGH/MEDIUM/LOW)
+3. Unsupported claims
+4. Contradictions
+5. Relevance to question
+6. Completeness (COMPLETE/PARTIAL/INCOMPLETE)
+**Scoring:**
+- HIGH: All claims directly stated, no ambiguity
+- MEDIUM: Most claims supported, some inferred
+- LOW: Significant claims unsupported
+{question_section}
+**Answer to Verify:**
+{answer}
+**Context:**
+{context}
+Provide your verification analysis."""
+    def format_verification_report(self, verification: VerificationResult) -> str:
+        """Format verification result into readable report."""
+        report = f"**Supported:** {verification.supported}\n"
+        report += f"**Confidence:** {verification.confidence}\n"
+        report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n"
+        report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n"
+        report += f"**Relevant:** {verification.relevant}\n"
+        report += f"**Completeness:** {verification.completeness}\n"
+        report += f"**Additional Details:** {verification.additional_details or 'None'}\n"
+        return report
+    def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]:
+        """Generate feedback for research agent if improvements needed."""
+        feedback_parts = []
+        if verification.supported == "NO":
+            feedback_parts.append("Answer lacks sufficient support from documents.")
+        elif verification.supported == "PARTIAL":
+            feedback_parts.append("Some parts are not well supported.")
+        if verification.unsupported_claims:
+            claims_str = "; ".join(verification.unsupported_claims[:3])
+            feedback_parts.append(f"Unsupported: {claims_str}")
+        if verification.contradictions:
+            contradictions_str = "; ".join(verification.contradictions[:3])
+            feedback_parts.append(f"Contradictions: {contradictions_str}")
+        if verification.completeness == "INCOMPLETE":
+            feedback_parts.append("Answer is incomplete.")
+        if verification.confidence == "LOW":
+            feedback_parts.append("Focus on directly verifiable claims.")
+        # Always add additional_details if present, even if other feedback exists
+        if verification.additional_details:
+            feedback_parts.append(f"Additional Details: {verification.additional_details}")
+        return " | ".join(feedback_parts) if feedback_parts else None
+    def should_retry_research(self, verification: VerificationResult) -> bool:
+        """Determine if research should be retried."""
+        if verification.supported == "NO" or verification.relevant == "NO":
+            return True
+        if verification.confidence == "LOW" and (
+            verification.unsupported_claims or verification.contradictions
+        ):
+            return True
+        if verification.supported == "PARTIAL" and verification.contradictions:
+            return True
+        return False
+    def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict:
+        """
+        Verify answer against provided documents.
+        Args:
+            answer: The answer to verify
+            documents: Source documents for verification
+            question: Optional original question
+        Returns:
+            Dict with verification report, context, and metadata
+        """
+        logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents")
+        context = "\n\n".join([doc.page_content for doc in documents])
+        if len(context) > self.max_context_chars:
+            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
+            context = context[:self.max_context_chars]
+        prompt = self.generate_prompt(answer, context, question)
+        try:
+            logger.debug("Calling LLM for verification...")
+            verification_result: VerificationResult = self.structured_llm.invoke(prompt)
+            logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})")
+        except Exception as e:
+            logger.error(f"Structured output failed: {e}")
+            try:
+                response = self.llm.invoke(prompt)
+                report = response.content if hasattr(response, "content") else str(response)
+                verification_result = self._parse_unstructured_response(report.strip())
+            except Exception as fallback_error:
+                logger.error(f"Fallback failed: {fallback_error}")
+                verification_result = VerificationResult(
+                    supported="NO",
+                    confidence="LOW",
+                    relevant="NO",
+                    completeness="INCOMPLETE",
+                    additional_details=f"Verification failed: {str(e)}"
+                )
+        verification_report = self.format_verification_report(verification_result)
+        feedback = self.generate_feedback_for_research(verification_result)
+        if feedback:
+            logger.debug(f"Generated feedback: {feedback[:80]}...")
+        return {
+            "verification_report": verification_report,
+            "context_used": context,
+            "structured_result": verification_result.model_dump(),
+            "should_retry": self.should_retry_research(verification_result),
+            "feedback": feedback
+        }
+    def select_best_answer(
+        self,
+        candidate_answers: List[str],
+        documents: List[Document],
+        question: str
+    ) -> Dict:
+        """
+        Select the best answer from multiple candidates based on verification criteria.
+        Args:
+            candidate_answers: List of candidate answers to evaluate
+            documents: Source documents for verification
+            question: The original question
+        Returns:
+            Dict with selected answer, index, reasoning, and verification details
+        """
+        logger.info(f"Selecting best answer from {len(candidate_answers)} candidates")
+        if len(candidate_answers) == 0:
+            logger.warning("No candidate answers provided")
+            return {
+                "selected_answer": "No answers were generated.",
+                "selected_index": -1,
+                "reasoning": "No candidates available",
+                "confidence": "LOW"
+            }
+        if len(candidate_answers) == 1:
+            logger.info("Only one candidate, returning it directly")
+            return {
+                "selected_answer": candidate_answers[0],
+                "selected_index": 0,
+                "reasoning": "Only one candidate answer was provided",
+                "confidence": "MEDIUM"
+            }
+        context = "\n\n".join([doc.page_content for doc in documents])
+        if len(context) > self.max_context_chars:
+            logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
+            context = context[:self.max_context_chars]
+        candidates_text = ""
+        for i, answer in enumerate(candidate_answers):
+            candidates_text += f"\n**Candidate {i + 1}:**\n{answer}\n"
+        prompt = f"""You are evaluating multiple candidate answers to select the best one.
+**Original Question:** {question}
+**Candidate Answers:**
+{candidates_text}
+**Source Context:**
+{context}
+**Evaluation Criteria:**
+1. **Factual Accuracy**: Which answer is most accurately supported by the context?
+2. **Completeness**: Which answer most thoroughly addresses the question?
+3. **Relevance**: Which answer stays most focused on what was asked?
+4. **No Contradictions**: Which answer has the fewest contradictions with the source?
+5. **Clarity**: Which answer is clearest and most well-structured?
+Select the best answer by providing its index (0-based) and explain your reasoning."""
+        try:
+            logger.debug("Calling LLM for best answer selection...")
+            selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt)
+            selected_index = selection_result.selected_index
+            if selected_index < 0 or selected_index >= len(candidate_answers):
+                logger.warning(f"Invalid selection index {selected_index}, defaulting to 0")
+                selected_index = 0
+            logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence")
+            return {
+                "selected_answer": candidate_answers[selected_index],
+                "selected_index": selected_index,
+                "reasoning": selection_result.reasoning,
+                "confidence": selection_result.confidence,
+                "comparison_summary": selection_result.comparison_summary
+            }
+        except Exception as e:
+            logger.error(f"Best answer selection failed: {e}")
+            # Fallback: return the first candidate
+            return {
+                "selected_answer": candidate_answers[0],
+                "selected_index": 0,
+                "reasoning": f"Selection failed, using first candidate: {str(e)}",
+                "confidence": "LOW"
+            }
+    def _parse_unstructured_response(self, response_text: str) -> VerificationResult:
+        """Parse unstructured response into VerificationResult (fallback)."""
+        try:
+            data = {
+                "supported": "NO",
+                "confidence": "LOW",
+                "unsupported_claims": [],
+                "contradictions": [],
+                "relevant": "NO",
+                "completeness": "INCOMPLETE",
+                "additional_details": ""
+            }
+            for line in response_text.split('\n'):
+                if ':' not in line:
+                    continue
+                key, value = line.split(':', 1)
+                key = key.strip().lower().replace(' ', '_')
+                value = value.strip().upper()
+                if key == "SUPPORTED":
+                    data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO")
+                elif key == "CONFIDENCE":
+                    data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW")
+                elif key == "RELEVANT":
+                    data["relevant"] = "YES" if "YES" in value else "NO"
+                elif key == "COMPLETENESS":
+                    if "COMPLETE" in value and "INCOMPLETE" not in value:
+                        data["completeness"] = "COMPLETE"
+                    elif "PARTIAL" in value:
+                        data["completeness"] = "PARTIAL"
+            return VerificationResult(**data)
+        except Exception as e:
+            logger.error(f"Failed to parse response: {e}")
+            return VerificationResult(
+                supported="NO",
+                confidence="LOW",
+                relevant="NO",
+                completeness="INCOMPLETE",
+                additional_details="Failed to parse verification response"
+            )

intelligence/context_validator.py ADDED Viewed

	@@ -0,0 +1,235 @@

+"""
+Relevance checker module for document retrieval quality assessment.
+"""
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import BaseModel, Field
+from typing import Literal, Optional, List
+import logging
+from configuration.parameters import parameters
+logger = logging.getLogger(__name__)
+def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
+    """Estimate token count from text length."""
+    return len(text) // chars_per_token
+# ============================================================================
+# Structured Output Models
+# ============================================================================
+# Result schema for the relevance classification call; it is bound to the LLM via
+# ``with_structured_output`` in ContextValidator.__init__.
+# NOTE(review): pydantic normally exposes the class docstring and the Field
+# descriptions in the generated JSON schema, so they are presumably part of what
+# the model sees - confirm before rewording any of them.
+class ContextValidationClassification(BaseModel):
+    """Structured output for context validation classification."""
+    # Required: one of the three coverage labels.
+    classification: Literal["CAN_ANSWER", "PARTIAL", "NO_MATCH"] = Field(
+        description=(
+            "CAN_ANSWER: Passages contain enough info to fully answer. "
+            "PARTIAL: Passages mention the topic but incomplete. "
+            "NO_MATCH: Passages don't discuss the topic at all."
+        )
+    )
+    # Optional: defaults to "MEDIUM" when the model omits it.
+    confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
+        default="MEDIUM",
+        description="Confidence level in the classification"
+    )
+    # Optional: defaults to an empty string when the model omits it.
+    reasoning: str = Field(
+        default="",
+        description="Brief explanation for the classification"
+    )
+# Result schema for the query-rewrite call; it is bound to the LLM via
+# ``with_structured_output`` in ContextValidator.__init__.
+# NOTE(review): the docstring and Field descriptions are presumably sent to the
+# model as part of the schema - confirm before rewording any of them.
+class ContextQueryExpansion(BaseModel):
+    """Structured output for query expansion/rewriting."""
+    # Required: the only field without a default.
+    rewritten_query: str = Field(
+        description="A rephrased version of the original query"
+    )
+    # Optional: a fresh empty list is created per instance when omitted.
+    key_terms: List[str] = Field(
+        default_factory=list,
+        description="Key terms and synonyms to search for"
+    )
+    # Optional: defaults to an empty string when omitted.
+    search_strategy: str = Field(
+        default="",
+        description="Brief explanation of the search approach"
+    )
+class ContextValidator:
+    """
+    Checks context relevance of retrieved documents to a user's question.
+    Uses Gemini model with structured output to classify coverage
+    and provides query rewriting for improved retrieval.
+    """
+    VALID_LABELS = {"CAN_ANSWER", "PARTIAL", "NO_MATCH"}
+    def __init__(self):
+        """Initialize the context validator."""
+        logger.info("Initializing ContextValidator...")
+        base_llm = ChatGoogleGenerativeAI(
+            model=parameters.RELEVANCE_CHECKER_MODEL,
+            google_api_key=parameters.GOOGLE_API_KEY,
+            temperature=0,
+            max_output_tokens=100,
+        )
+        self.llm = base_llm
+        self.structured_llm = base_llm.with_structured_output(ContextValidationClassification)
+        self.query_expansion_llm = base_llm.with_structured_output(ContextQueryExpansion)
+        logger.info(f"ContextValidator initialized (model={parameters.RELEVANCE_CHECKER_MODEL})")
+    def context_query_rewrite(self, original_query: str, context_hint: Optional[str] = None) -> Optional[ContextQueryExpansion]:
+        """
+        Rewrite a query to potentially retrieve better results.
+        Args:
+            original_query: The original user query
+            context_hint: Optional hint about available documents
+        Returns:
+            ContextQueryExpansion with rewritten query, or None on failure
+        """
+        logger.debug(f"Rewriting query: {original_query[:80]}...")
+        context_section = f"\n**Available Context:** {context_hint}\n" if context_hint else ""
+        prompt = f"""Rewrite this query to improve document retrieval.
+**Original Query:** {original_query}
+{context_section}
+**Instructions:**
+1. Rephrase to be more specific and searchable
+2. Extract key terms and add synonyms
+3. Consider exact phrases in formal documents"""
+        try:
+            result: ContextQueryExpansion = self.query_expansion_llm.invoke(prompt)
+            logger.debug(f"Query rewritten: {result.rewritten_query[:60]}...")
+            return result
+        except Exception as e:
+            logger.error(f"Query rewrite failed: {e}")
+            return None
+    def context_validate(self, question: str, retriever, k: int = 3) -> str:
+        """
+        Retrieve top-k passages and classify coverage.
+        Args:
+            question: The user's question
+            retriever: The retriever for fetching documents
+            k: Number of top documents to consider
+        Returns:
+            Classification: "CAN_ANSWER", "PARTIAL", or "NO_MATCH"
+        """
+        if not question or not question.strip():
+            logger.warning("Empty question provided")
+            return "NO_MATCH"
+        if k < 1:
+            k = 3
+        logger.info(f"Checking context relevance for: {question[:60]}...")
+        # Retrieve documents
+        try:
+            top_docs = retriever.invoke(question)
+        except Exception as e:
+            logger.error(f"Retriever invocation failed: {e}")
+            return "NO_MATCH"
+        if not top_docs:
+            logger.info("No documents returned")
+            return "NO_MATCH"
+        logger.debug(f"Retrieved {len(top_docs)} documents")
+        passages = "\n\n".join(doc.page_content for doc in top_docs[:k])
+        prompt = f"""Classify how well the passages address the question.
+**Question:** {question}
+**Passages:**
+{passages}
+Classify as CAN_ANSWER (fully answers), PARTIAL (mentions topic), or NO_MATCH (unrelated)."""
+        try:
+            result: ContextValidationClassification = self.structured_llm.invoke(prompt)
+            logger.info(f"Context relevance: {result.classification} ({result.confidence})")
+            return result.classification
+        except Exception as e:
+            logger.error(f"Structured output failed: {e}")
+            # Fallback to text parsing
+            try:
+                response = self.llm.invoke(prompt)
+                raw_response = response.content if hasattr(response, "content") else str(response)
+                llm_response = raw_response.strip().upper()
+                for label in self.VALID_LABELS:
+                    if label in llm_response:
+                        logger.info(f"Fallback classification: {label}")
+                        return label
+                return "NO_MATCH"
+            except Exception as fallback_error:
+                logger.error(f"Fallback failed: {fallback_error}")
+                return "NO_MATCH"
+    def context_validate_with_rewrite(self, question: str, retriever, k: int = 3, max_rewrites: int = 1) -> dict:
+        """
+        Check relevance with automatic query rewriting if needed.
+        Args:
+            question: The user's question
+            retriever: The retriever to use
+            k: Number of top documents
+            max_rewrites: Maximum rewrite attempts
+        Returns:
+            Dict with classification, query_used, and was_rewritten
+        """
+        classification = self.context_validate(question, retriever, k)
+        if classification == "CAN_ANSWER" or max_rewrites <= 0:
+            return {
+                "classification": classification,
+                "query_used": question,
+                "was_rewritten": False
+            }
+        # Try query rewriting for poor results
+        if classification in ["PARTIAL", "NO_MATCH"]:
+            logger.info("Attempting query rewrite...")
+            expansion = self.context_query_rewrite(question)
+            if expansion and expansion.rewritten_query != question:
+                new_classification = self.context_validate(expansion.rewritten_query, retriever, k)
+                if self._is_better_classification(new_classification, classification):
+                    logger.info(f"Rewrite improved: {classification} -> {new_classification}")
+                    return {
+                        "classification": new_classification,
+                        "query_used": expansion.rewritten_query,
+                        "was_rewritten": True,
+                        "key_terms": expansion.key_terms
+                    }
+        return {
+            "classification": classification,
+            "query_used": question,
+            "was_rewritten": False
+        }
+    def _is_better_classification(self, new: str, old: str) -> bool:
+        """Check if new classification is better than old."""
+        ranking = {"NO_MATCH": 0, "PARTIAL": 1, "CAN_ANSWER": 2}
+        return ranking.get(new, 0) > ranking.get(old, 0)

intelligence/knowledge_synthesizer.py ADDED Viewed

	@@ -0,0 +1,172 @@

+from typing import Dict, List, Optional
+import logging
+from langchain_core.documents import Document
+from langchain_google_genai import ChatGoogleGenerativeAI
+from configuration.parameters import parameters
+logger = logging.getLogger(__name__)
+def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
+    """
+    Estimate token count from text length.
+    """
+    return len(text) // chars_per_token
+class ResearchAgent:
+    """
+    ResearchAgent generates answers to user questions using Gemini LLM,
+    focusing on extracting factual, source-cited information from documents.
+    """
+    def __init__(
+        self,
+        llm: Optional[ChatGoogleGenerativeAI] = None,
+        top_k: int = None,
+        max_context_chars: int = None,
+        max_output_tokens: int = None,
+    ) -> None:
+        """
+        Initialize the research agent with the Gemini model and configuration.
+        """
+        logger.info("[RESEARCH_AGENT] Initializing...")
+        self.top_k = top_k or parameters.RESEARCH_TOP_K
+        self.max_context_chars = max_context_chars or parameters.RESEARCH_MAX_CONTEXT_CHARS
+        self.max_output_tokens = max_output_tokens or parameters.RESEARCH_MAX_OUTPUT_TOKENS
+        self.llm = llm or ChatGoogleGenerativeAI(
+            model=parameters.RESEARCH_AGENT_MODEL,
+            google_api_key=parameters.GOOGLE_API_KEY,
+            temperature=0.2,
+            max_output_tokens=self.max_output_tokens
+        )
+        logger.info(f"[RESEARCH_AGENT] ✓ Initialized (top_k={self.top_k}, model={parameters.RESEARCH_AGENT_MODEL})")
+    def sanitize_response(self, response_text: str) -> str:
+        """
+        Sanitize the LLM's response by stripping unnecessary whitespace.
+        """
+        return response_text.strip()
+    def generate_prompt(self, question: str, context: str, feedback: Optional[str] = None) -> str:
+        """
+        Generate a structured prompt for the LLM to generate a precise and factual answer.
+        Includes special instructions for handling tables, charts, and visualizations.
+        """
+        base_prompt = f"""You are an AI assistant designed to provide precise and factual answers based on the given context.
+**Instructions:**
+- Answer the following question using only the provided context.
+- Be clear, concise, and factual.
+- Return as much information as you can get from the context.
+- Only include claims that are directly supported by the context.
+**IMPORTANT - Data Consolidation:**
+- If multiple charts, tables, or data sources provide similar information, CONSOLIDATE the data and provide a single, unified answer.
+- DO NOT list or compare values from multiple versions of the same charts/tables separately.
+- Present only the most relevant or consensus value for each data point, unless there is a clear, significant difference that must be explained.
+- If there are minor discrepancies, choose the value that appears most frequently or is best supported by the context, and mention only that value.
+**IMPORTANT - Chart and Page Reference:**
+- When referencing data from a chart, always indicate the chart's heading or title, and also include the page title if available.
+- Do NOT use phrases like "another chart" or "a different chart". Always refer to the chart by its heading/title and the page title if you need to mention the source.
+**CRITICAL - Table, Chart, and Visualization Handling:**
+- Pay VERY CLOSE attention to any tables in the context (formatted with | characters or markdown table format).
+- Tables contain structured data - read them carefully row by row, column by column.
+- Extract and cite specific numbers, percentages, scores, and ratings from tables.
+- If a numbered table (Table 1, Table 4, etc.) is relevant, explicitly mention it and provide the exact values.
+- **Analyze complex charts and visualizations** when present in the context:
+  - Look for chart descriptions, data points, trends, and patterns
+  - Extract specific values from line charts, bar charts, pie charts, and scatter plots
+  - Identify trends, correlations, and relationships shown in visualizations
+  - Note any zones, quadrants, or regions in complex diagrams
+  - Reference chart titles, axis labels, and legends when citing data
+  - Compare multiple visualizations if relevant to the question
+"""
+        if feedback:
+            base_prompt += f"""
+**IMPORTANT - Previous Answer Feedback:**
+Your previous answer had issues that need to be addressed:
+{feedback}
+Please generate an improved answer that:
+1. Addresses the unsupported claims by finding support in the context tables and charts
+2. Fixes any contradictions with the source material
+3. Ensures all statements are verifiable from the context
+4. Look carefully at ALL tables and visualizations - the data you need may be in a numbered table or chart description
+5. Read table data and chart descriptions carefully - each row/data point represents specific information
+"""
+        base_prompt += f"""
+**Question:** {question}
+**Context (pay special attention to tables marked with ### Table, chart descriptions, and data visualizations):**
+{context}
+**Provide your answer below (cite specific table numbers, chart references, and exact values from the tables and visualizations):**
+"""
+        return base_prompt
+    def generate(
+        self,
+        question: str,
+        documents: List[Document],
+        feedback: Optional[str] = None,
+        previous_answer: Optional[str] = None
+    ) -> Dict:
+        """
+        Generate an initial answer using the provided documents.
+        Args:
+            question: The user's question
+            documents: List of relevant documents
+            feedback: Optional feedback from verification agent for re-research
+            previous_answer: Optional previous answer that failed verification
+        Returns:
+            Dict with 'draft_answer' and 'context_used'
+        """
+        logger.info(f"[RESEARCH_AGENT] Generating answer for: {question[:80]}...")
+        logger.debug(f"[RESEARCH_AGENT] Documents: {len(documents)}, Feedback: {feedback is not None}")
+        if not documents:
+            logger.warning("[RESEARCH_AGENT] No documents provided")
+            return {
+                "draft_answer": "I could not find supporting documents to answer this question.",
+                "context_used": ""
+            }
+        # Combine the top document contents into one string
+        context = "\n\n".join([doc.page_content for doc in documents[:self.top_k]])
+        # Truncate context if too long
+        if len(context) > self.max_context_chars:
+            logger.debug(f"[RESEARCH_AGENT] Context truncated: {len(context)} -> {self.max_context_chars} chars")
+            context = context[:self.max_context_chars]
+        # Create a prompt for the LLM (with optional feedback)
+        prompt = self.generate_prompt(question, context, feedback)
+        # Call the LLM to generate the answer
+        try:
+            response = self.llm.invoke(prompt)
+            content = response.content if hasattr(response, "content") else str(response)
+            answer = content.strip()
+            logger.info("[RESEARCH_AGENT] Answer generated successfully")
+        except Exception as e:
+            logger.error(f"[RESEARCH_AGENT] LLM call failed: {e}", exc_info=True)
+            raise RuntimeError("Failed to generate answer due to a model error.") from e
+        # Sanitize the response
+        draft_answer = self.sanitize_response(answer) if answer else "I cannot answer this question based on the provided documents."
+        logger.debug(f"[RESEARCH_AGENT] Answer length: {len(draft_answer)} chars")
+        return {
+            "draft_answer": draft_answer,
+            "context_used": context
+        }

intelligence/orchestrator.py ADDED Viewed

	@@ -0,0 +1,388 @@

+"""
+Agent workflow orchestration using LangGraph.
+Defines the multi-agent orchestrator that:
+1. Checks document relevance
+2. Generates multiple answer candidates using research agent
+3. Selects the best answer through verification
+4. Provides feedback loop for iterative improvement
+"""
+from langgraph.graph import StateGraph, END
+from typing import TypedDict, List, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+import logging
+from .knowledge_synthesizer import ResearchAgent
+from .accuracy_verifier import VerificationAgent
+from .context_validator import ContextValidator
+from langchain_google_genai import ChatGoogleGenerativeAI
+from configuration.parameters import parameters
+logger = logging.getLogger(__name__)
+class AgentState(TypedDict):
+    """State object passed between orchestrator nodes."""
+    # NOTE(review): _research_step also reads "contradictions_for_research",
+    # "unsupported_claims_for_research" and "feedback_for_research" via state.get();
+    # none of them is declared here - confirm whether they should be.
+    # Question being answered; replaced by each sub-query in _process_sub_queries_step.
+    question: str
+    # Documents retrieved for the question; replaced when the relevance step rewrites the query.
+    documents: List[Document]
+    # Answer text returned by full_pipeline; the relevance step stores its refusal message here too.
+    draft_answer: str
+    # Verification text returned by full_pipeline alongside the answer.
+    verification_report: str
+    # Set by _check_relevance_step and read by _decide_after_relevance_check.
+    is_relevant: bool
+    # Retriever passed to full_pipeline; reused for relevance checks and re-retrieval.
+    retriever: BaseRetriever
+    # Read by _research_step; starts as None.
+    feedback: Optional[str]
+    # Starts at 0; _research_step derives the current attempt number from it.
+    research_attempts: int
+    # Query reported by the relevance step (the rewritten one when a rewrite was adopted).
+    query_used: str
+    # Initialised empty in full_pipeline; presumably filled by the research/verify steps - confirm.
+    candidate_answers: List[str]
+    selection_reasoning: str
+    # For multi-question support
+    # Set by _detect_query_type; selects the "multi" or "single" route in the graph.
+    is_multi_query: bool
+    # Sub-questions produced by _detect_query_type.
+    sub_queries: List[str]
+    # Answers collected by _process_sub_queries_step, one per processed sub-query.
+    sub_answers: List[str]
+class AgentWorkflow:
+    """
+    Orchestrates the multi-agent workflow for document Q&A.
+    Workflow:
+    1. Relevance Check - Determines if documents can answer the question
+    2. Research - Generates multiple answer candidates using document context
+    3. Verification - Selects the best answer from candidates
+    """
+    MAX_RESEARCH_ATTEMPTS: int = 7
+    NUM_RESEARCH_CANDIDATES: int = 3
+    def __init__(self, num_candidates: int = None) -> None:
+        """Initialize orchestrator with required agents."""
+        logger.info("Initializing AgentWorkflow...")
+        self.researcher = ResearchAgent()
+        self.verifier = VerificationAgent()
+        self.context_validator = ContextValidator()
+        self.compiled_orchestrator = None
+        self.llm = ChatGoogleGenerativeAI(
+            model=parameters.LLM_MODEL_NAME,
+            google_api_key=parameters.GOOGLE_API_KEY,
+            temperature=0.1,
+            max_output_tokens=256
+        )
+        if num_candidates is not None:
+            self.NUM_RESEARCH_CANDIDATES = num_candidates
+        logger.info(f"AgentWorkflow initialized (candidates={self.NUM_RESEARCH_CANDIDATES})")
+    def build_orchestrator(self) -> Any:
+        """Create and compile the orchestrator graph."""
+        logger.debug("Building orchestrator graph...")
+        orchestrator = StateGraph(AgentState)
+        orchestrator.add_node("detect_query_type", self._detect_query_type)
+        orchestrator.add_node("process_sub_queries", self._process_sub_queries_step)
+        orchestrator.add_node("combine_answers", self._combine_answers_step)
+        orchestrator.add_node("check_relevance", self._check_relevance_step)
+        orchestrator.add_node("research", self._research_step)
+        orchestrator.add_node("verify", self._verification_step)
+        orchestrator.set_entry_point("detect_query_type")
+        orchestrator.add_conditional_edges(
+            "detect_query_type",
+            lambda state: "multi" if state.get("is_multi_query") else "single",
+            {"multi": "process_sub_queries", "single": "check_relevance"}
+        )
+        orchestrator.add_edge("process_sub_queries", "combine_answers")
+        orchestrator.add_edge("combine_answers", END)
+        orchestrator.add_conditional_edges(
+            "check_relevance",
+            self._decide_after_relevance_check,
+            {"relevant": "research", "irrelevant": END}
+        )
+        orchestrator.add_edge("research", "verify")
+        orchestrator.add_conditional_edges(
+            "verify",
+            self._decide_next_step,
+            {"re_research": "research", "end": END}
+        )
+        return orchestrator.compile()
+    def _detect_query_type(self, state: AgentState) -> Dict[str, Any]:
+        """
+        Use LLM to detect if the question is multi-part and decompose it if so.
+        """
+        prompt = f"""
+You are an expert assistant for document Q&A. Analyze the following question and determine:
+1. Is it a single question or does it contain multiple sub-questions?
+2. If it contains multiple questions, decompose it into a list of clear, standalone sub-questions (no overlap, no ambiguity).
+Return your answer as a JSON object with two fields:
+- is_multi_query: true or false
+- sub_queries: a list of strings (the sub-questions, or a single-item list if only one)
+Question: {state['question']}
+"""
+        try:
+            response = self.llm.invoke(prompt)
+            import json
+            content = response.content if hasattr(response, "content") else str(response)
+            # Try to extract JSON from the response
+            start = content.find('{')
+            end = content.rfind('}')
+            if start != -1 and end != -1:
+                json_str = content[start:end+1]
+                result = json.loads(json_str)
+                is_multi = bool(result.get("is_multi_query", False))
+                sub_queries = result.get("sub_queries", [])
+            else:
+                # Fallback: treat as single question
+                is_multi = False
+                sub_queries = [state["question"]]
+        except Exception as e:
+            logger.error(f"LLM decomposition failed: {e}")
+            is_multi = False
+            sub_queries = [state["question"]]
+        if is_multi:
+            logger.info(f"[LLM Decompose] Multi-question detected: {len(sub_queries)} sub-queries")
+        else:
+            logger.info("[LLM Decompose] Single question detected; no decomposition needed.")
+        return {"is_multi_query": is_multi, "sub_queries": sub_queries}
+    def _process_sub_queries_step(self, state: AgentState) -> Dict[str, Any]:
+        sub_answers = []
+        logger.info(f"[Decompose] Processing {len(state['sub_queries'])} sub-queries...")
+        for sub_query in state["sub_queries"]:
+            logger.info(f"[Decompose] Processing sub-query: {sub_query}")
+            sub_state = state.copy()
+            sub_state["question"] = sub_query
+            rel = self._check_relevance_step(sub_state)
+            if not rel.get("is_relevant"):
+                logger.warning(f"[Decompose] Sub-query not relevant: {sub_query}")
+                sub_answers.append(rel.get("draft_answer", "No answer found."))
+                continue
+            sub_state.update(rel)
+            research = self._research_step(sub_state)
+            sub_state.update(research)
+            verify = self._verification_step(sub_state)
+            sub_state.update(verify)
+            sub_answers.append(sub_state["draft_answer"])
+        logger.info(f"[Decompose] Sub-query answers: {sub_answers}")
+        return {"sub_answers": sub_answers}
+    def _combine_answers_step(self, state: AgentState) -> Dict[str, Any]:
+        logger.info(f"[Decompose] Combining {len(state['sub_answers'])} sub-answers into final answer.")
+        combined = "\n\n".join(f"Q{i+1}: {q}\nA: {a}" for i, (q, a) in enumerate(zip(state["sub_queries"], state["sub_answers"])))
+        return {"draft_answer": combined, "verification_report": "Multi-question answer combined."}
+    def _check_relevance_step(self, state: AgentState) -> Dict[str, Any]:
+        """Check if retrieved documents are relevant to the question."""
+        logger.debug("Checking context relevance...")
+        result = self.context_validator.context_validate_with_rewrite(
+            question=state["question"],
+            retriever=state["retriever"],
+            k=20,
+            max_rewrites=1
+        )
+        classification = result["classification"]
+        query_used = result["query_used"]
+        was_rewritten = result.get("was_rewritten", False)
+        logger.info(f"Relevance: {classification}")
+        if was_rewritten:
+            logger.debug(f"Query rewritten: {query_used[:60]}...")
+        if classification in ["CAN_ANSWER", "PARTIAL"]:
+            if was_rewritten:
+                documents = state["retriever"].invoke(query_used)
+                return {"is_relevant": True, "query_used": query_used, "documents": documents}
+            return {"is_relevant": True, "query_used": state["question"]}
+        else:
+            return {
+                "is_relevant": False,
+                "query_used": state["question"],
+                "draft_answer": "This question isn't related to the uploaded documents. Please ask another question."
+            }
+    def _decide_after_relevance_check(self, state: AgentState) -> str:
+        """Decide next step after relevance check."""
+        return "relevant" if state["is_relevant"] else "irrelevant"
+    def full_pipeline(self, question: str, retriever: BaseRetriever) -> Dict[str, str]:
+        """
+        Run a question through the compiled workflow and return its answer.
+        The orchestrator is built on first use and cached on the instance.
+        Args:
+            question: The user's question
+            retriever: The retriever used for the initial lookup and stored in the state
+        Returns:
+            Dict with 'draft_answer' and 'verification_report'
+        Raises:
+            RuntimeError: If anything in the pipeline raises; the original
+                exception is chained as the cause.
+        """
+        try:
+            if self.compiled_orchestrator is None:
+                self.compiled_orchestrator = self.build_orchestrator()
+            logger.info(f"Starting pipeline: {question[:80]}...")
+            retrieved_docs = retriever.invoke(question)
+            logger.info(f"Retrieved {len(retrieved_docs)} documents")
+            # Starting values for every field of the workflow state.
+            seed_state: AgentState = dict(
+                question=question,
+                documents=retrieved_docs,
+                draft_answer="",
+                verification_report="",
+                is_relevant=False,
+                retriever=retriever,
+                feedback=None,
+                research_attempts=0,
+                query_used=question,
+                candidate_answers=[],
+                selection_reasoning="",
+                is_multi_query=False,
+                sub_queries=[],
+                sub_answers=[],
+            )
+            outcome = self.compiled_orchestrator.invoke(seed_state)
+            attempts_made = outcome.get('research_attempts', 1)
+            logger.info(f"Pipeline completed (attempts: {attempts_made})")
+            return {key: outcome[key] for key in ("draft_answer", "verification_report")}
+        except Exception as exc:
+            logger.error(f"Pipeline failed: {exc}", exc_info=True)
+            raise RuntimeError(f"Workflow execution failed: {exc}") from exc
+    def _research_step(self, state: AgentState) -> Dict[str, Any]:
+        """Produce NUM_RESEARCH_CANDIDATES draft answers via the research agent.
+        Feedback found in the state, plus any contradictions and unsupported
+        claims stored there, is merged into one feedback string for the researcher.
+        Returns a partial state update: the candidate list, the incremented
+        attempt counter, and "feedback" reset to None.
+        """
+        attempt_no = state.get("research_attempts", 0) + 1
+        verifier_feedback = state.get("feedback")
+        # The previous draft is only forwarded when there is feedback about it.
+        prior_draft = state.get("draft_answer") if verifier_feedback else None
+        # Collect the optional suffix pieces, then join them once.
+        suffix_parts = []
+        flagged_contradictions = state.get("contradictions_for_research", [])
+        flagged_unsupported = state.get("unsupported_claims_for_research", [])
+        base_feedback = state.get("feedback_for_research", verifier_feedback)
+        if flagged_contradictions:
+            suffix_parts.append(" Contradictions: " + "; ".join(flagged_contradictions) + ".")
+        if flagged_unsupported:
+            suffix_parts.append(" Unsupported Claims: " + "; ".join(flagged_unsupported) + ".")
+        suffix = "".join(suffix_parts)
+        # With base feedback the suffix is appended; without it the suffix stands alone.
+        merged_feedback = base_feedback + suffix if base_feedback else suffix.strip()
+        logger.info(f"Research step (attempt {attempt_no}/{self.MAX_RESEARCH_ATTEMPTS})")
+        logger.info(f"Generating {self.NUM_RESEARCH_CANDIDATES} candidate answers...")
+        drafts = []
+        for ordinal in range(1, self.NUM_RESEARCH_CANDIDATES + 1):
+            logger.info(f"Generating candidate {ordinal}/{self.NUM_RESEARCH_CANDIDATES}")
+            generated = self.researcher.generate(
+                question=state["question"],
+                documents=state["documents"],
+                feedback=merged_feedback,
+                previous_answer=prior_draft
+            )
+            drafts.append(generated["draft_answer"])
+        logger.info(f"Generated {len(drafts)} candidate answers")
+        return {
+            "candidate_answers": drafts,
+            "research_attempts": attempt_no,
+            "feedback": None
+        }
+    def _verification_step(self, state: AgentState) -> Dict[str, Any]:
+        """Choose the best candidate answer, verify it, and build the combined report.
+        Returns a partial state update with the chosen answer as "draft_answer",
+        the verification report prefixed by a selection summary, the verifier's
+        feedback, and the selection reasoning.
+        """
+        logger.debug("Selecting best answer from candidates...")
+        candidates = state.get("candidate_answers", [])
+        if not candidates:
+            # Nothing was generated: fall back to the draft already in the state.
+            logger.warning("No candidate answers found, using draft_answer")
+            candidates = [state.get("draft_answer", "")]
+        docs = state["documents"]
+        query = state["question"]
+        selection = self.verifier.select_best_answer(
+            candidate_answers=candidates,
+            documents=docs,
+            question=query
+        )
+        chosen_answer = selection["selected_answer"]
+        reasoning = selection.get("reasoning", "")
+        # selected_index is shown 1-based in logs and in the report.
+        chosen_number = selection['selected_index'] + 1
+        logger.info(f"Selected candidate {chosen_number} as best answer")
+        check_outcome = self.verifier.check(
+            answer=chosen_answer,
+            documents=docs,
+            question=query
+        )
+        raw_report = check_outcome["verification_report"]
+        # Selection summary goes on top, separated from the verifier's report by a blank line.
+        selection_header = "\n".join([
+            f"**Candidates Evaluated:** {len(candidates)}",
+            f"**Selected Candidate:** {chosen_number}",
+            f"**Selection Confidence:** {selection.get('confidence', 'N/A')}",
+            f"**Selection Reasoning:** {reasoning}",
+        ])
+        return {
+            "draft_answer": chosen_answer,
+            "verification_report": selection_header + "\n\n" + raw_report,
+            "feedback": check_outcome.get("feedback"),
+            "selection_reasoning": reasoning
+        }
+    def _decide_next_step(self, state: AgentState) -> str:
+        """Decide whether to re-research or end orchestrator.
+        Re-research is requested when the verification report contains one of the
+        failure markers below, or when the feedback text mentions a contradiction
+        or an unsupported claim. It is only granted while research_attempts is
+        below MAX_RESEARCH_ATTEMPTS.
+        Args:
+            state: Workflow state; reads "verification_report", "research_attempts"
+                and "feedback", and writes the three "*_for_research" keys.
+        Returns:
+            "re_research" to loop back to the research step, otherwise "end".
+        """
+        verification_report = state["verification_report"]
+        research_attempts = state.get("research_attempts", 1)
+        feedback = state.get("feedback")
+        def _items_after_label(line: str) -> list:
+            # The matched lines start with markdown bold ("**Label:"), so the value part
+            # can carry stray "*" markers; strip them so items are clean and a bare
+            # "None" is recognised and dropped.
+            raw_items = line.split(":", 1)[-1].split(",")
+            cleaned = (item.strip().strip("*").strip() for item in raw_items)
+            return [item for item in cleaned if item and item.lower() != "none"]
+        # Extract contradictions and unsupported claims for feedback
+        contradictions = []
+        unsupported_claims = []
+        for line in verification_report.splitlines():
+            if line.startswith("**Contradictions:"):
+                contradictions = _items_after_label(line)
+            if line.startswith("**Unsupported Claims:"):
+                unsupported_claims = _items_after_label(line)
+        # (markers that must ALL appear in the report, warning to log); first match wins.
+        report_triggers = (
+            (("Supported: NO",), "[Re-Research] Answer not supported; triggering re-research."),
+            (("Relevant: NO",), "[Re-Research] Answer not relevant; triggering re-research."),
+            (("Confidence: LOW", "Supported: PARTIAL"), "[Re-Research] Low confidence with partial support; triggering re-research."),
+            (("Completeness: INCOMPLETE",), "[Re-Research] Answer is incomplete; triggering re-research."),
+            (("Completeness: PARTIAL",), "[Re-Research] Answer is partially complete; triggering re-research."),
+        )
+        needs_re_research = False
+        for markers, warning_text in report_triggers:
+            if all(marker in verification_report for marker in markers):
+                needs_re_research = True
+                logger.warning(warning_text)
+                break
+        if feedback and not needs_re_research:
+            if "contradiction" in feedback.lower() or "unsupported" in feedback.lower():
+                needs_re_research = True
+                logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.")
+        # Store extra feedback for research node
+        # NOTE(review): if this method is wired as a graph routing (conditional-edge)
+        # function, in-place writes to `state` may not be persisted to the next node -
+        # confirm, and if so return these values from the verification node instead.
+        state["contradictions_for_research"] = contradictions
+        state["unsupported_claims_for_research"] = unsupported_claims
+        state["feedback_for_research"] = feedback
+        if needs_re_research and research_attempts < self.MAX_RESEARCH_ATTEMPTS:
+            logger.info(f"[Re-Research] Re-researching (attempt {research_attempts + 1})")
+            return "re_research"
+        elif needs_re_research:
+            logger.warning("[Re-Research] Max attempts reached, returning best effort.")
+            return "end"
+        else:
+            logger.info("[Re-Research] Verification passed; ending workflow.")
+            return "end"

main.py ADDED Viewed

	@@ -0,0 +1,986 @@

+import configuration.logger_setup
+import logging
+logger = logging.getLogger(__name__)
+import hashlib
+import socket
+from typing import List, Dict
+import os
+import shutil
+from pathlib import Path
+from datetime import datetime
+import time
+import random
+from content_analyzer.document_parser import DocumentProcessor
+from search_engine.indexer import RetrieverBuilder
+from intelligence.orchestrator import AgentWorkflow
+from configuration import definitions, parameters
+import gradio as gr
+# Demo presets: display name -> {"question": sample question, "file_paths": sample documents under samples/}
+EXAMPLES = {
+    "Generative AI and Jobs": {
+        "question": "Which occupations are most likely to be automated by AI?",
+        "file_paths": ["samples/OIT-NASK-IAGen_WP140_web.pdf"]
+    },
+    "Energy and AI": {
+        "question": "What is the accuracy of AI models in coding?",
+        "file_paths": ["samples/EnergyandAI.pdf"]
+    },
+     "Digital Progress and Trends Report 2025": {
+        "question": "which country has most Gen Ai patents and which country has most total funding raised by AI start-ups?",
+        "file_paths": ["samples/Digital Progress and Trends Report 2025, Strengthening AI Foundations.pdf"]
+    }
+}
+def format_chat_history(history: List[Dict]) -> str:
+    """Render chat history entries as one markdown string.
+    Each entry becomes a numbered section (starting at Q1) showing its timestamp,
+    question, answer and confidence; missing keys fall back to "" or "N/A".
+    An empty or missing history yields a placeholder message.
+    """
+    if not history:
+        return "*No conversation history yet. Ask a question to get started!*"
+    def _render_entry(position: int, entry: Dict) -> str:
+        # Same text as a triple-quoted template, spelled with explicit newlines.
+        stamp = entry.get("timestamp", "")
+        asked = entry.get("question", "")
+        replied = entry.get("answer", "")
+        certainty = entry.get("confidence", "N/A")
+        return (
+            "\n---\n"
+            f"### 💬 Q{position} ({stamp})\n"
+            f"**Question:** {asked}\n"
+            f"**Answer:** {replied}\n"
+            f"*Confidence: {certainty}*\n"
+        )
+    return "\n".join(_render_entry(n, item) for n, item in enumerate(history, 1))
+def format_document_context(documents: List, question: str = "") -> str:
+    """Format retrieved documents as collapsible markdown with key terms in bold.
+    Only the first five documents are rendered, each truncated to 500 characters;
+    a trailing note reports how many further chunks were omitted.
+    Args:
+        documents: Retrieved chunks. Objects with ``page_content`` / ``metadata``
+            attributes are read through them; anything else is rendered via ``str``.
+        question: Optional question whose non-stopword terms (first five, longer
+            than two characters) are highlighted in the chunk text.
+    Returns:
+        Markdown text, or a placeholder message when ``documents`` is empty.
+    """
+    import re  # function-scope import: re is not among this module's top-level imports
+    if not documents:
+        return "*No documents retrieved yet.*"
+    stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'what', 'how', 'why', 'when', 'where', 'which'}
+    # Strip surrounding punctuation so a trailing "?" or "," cannot stop a term from matching.
+    words = (word.strip(".,;:!?\"'()[]{}").lower() for word in (question or "").split())
+    # dict.fromkeys removes duplicates while keeping the question's word order.
+    key_terms = list(dict.fromkeys(w for w in words if len(w) > 2 and w not in stopwords))[:5]
+    highlighter = None
+    if key_terms:
+        # A single alternation (longest term first) highlights in one pass, so text
+        # inserted for one term is never re-matched by another; compiled once per call.
+        alternation = "|".join(re.escape(term) for term in sorted(key_terms, key=len, reverse=True))
+        highlighter = re.compile(alternation, re.IGNORECASE)
+    formatted = [f"### 📚 Retrieved Context ({len(documents)} chunks)\n"]
+    for i, doc in enumerate(documents[:5], 1):
+        content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
+        metadata = getattr(doc, 'metadata', None) or {}
+        # A missing or empty source falls back to 'Unknown' instead of crashing basename().
+        source = metadata.get('source') or 'Unknown'
+        # Truncate long content
+        if len(content) > 500:
+            content = content[:500] + "..."
+        # Highlight key terms, wrapping the matched text itself so the document's casing is kept.
+        if highlighter is not None:
+            content = highlighter.sub(lambda match: f"**{match.group(0)}**", content)
+        formatted.append(
+            "\n<details>\n"
+            f"<summary>📄 Chunk {i} - {os.path.basename(str(source))}</summary>\n"
+            f"{content}\n"
+            "</details>\n"
+        )
+    if len(documents) > 5:
+        formatted.append(f"\n*... and {len(documents) - 5} more chunks*")
+    return "\n".join(formatted)
+def _get_file_hashes(uploaded_files: List) -> frozenset:
+    """Return the SHA-256 hex digests of the uploaded files' contents as a frozenset.
+    Each item is opened through its ``name`` attribute; files with identical
+    content collapse to a single digest.
+    """
+    def _digest_of(path) -> str:
+        with open(path, "rb") as handle:
+            return hashlib.sha256(handle.read()).hexdigest()
+    return frozenset(_digest_of(item.name) for item in uploaded_files)
+def _find_open_port(start_port: int, max_attempts: int = 20) -> int:
+    """Return the first port at or above start_port that can be bound on 127.0.0.1.
+    Args:
+        start_port: First port number to try.
+        max_attempts: How many consecutive ports to try before giving up.
+    Raises:
+        RuntimeError: If none of the tried ports could be bound.
+    """
+    for candidate in range(start_port, start_port + max_attempts):
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            try:
+                probe.bind(("127.0.0.1", candidate))
+            except OSError:
+                # Bind failed for this port; move on to the next one.
+                continue
+            return candidate
+    raise RuntimeError(f"Could not find an open port starting at {start_port}")
+def _setup_gradio_shim():
+    """Shim Gradio's JSON schema conversion to tolerate boolean additionalProperties values."""
+    import gradio as gr
+    from gradio_client import utils as grc_utils
+    _orig_json_schema_to_python_type = grc_utils._json_schema_to_python_type  # keep the original so the wrapper can delegate to it
+    def _json_schema_to_python_type_safe(schema, defs=None):
+        if isinstance(schema, bool):  # a bare True/False schema is answered here instead of being passed on
+            return "Any" if schema else "Never"
+        return _orig_json_schema_to_python_type(schema, defs)
+    grc_utils._json_schema_to_python_type = _json_schema_to_python_type_safe  # replace the module attribute with the wrapper
+def main():
+    """Main application entry point."""
+    _setup_gradio_shim()
+    logger.info("=" * 60)
+    logger.info("Starting SmartDoc AI application...")
+    logger.info("=" * 60)
+    # Initialize components
+    processor = DocumentProcessor()
+    retriever_indexer = RetrieverBuilder()
+    orchestrator = AgentWorkflow()
+    logger.info("All components initialized successfully")
+    # CSS styling - Clean, accessible light theme with professional colors
+    css = """
+    /* Global styling - Light, clean background */
+    .gradio-container {
+        background: linear-gradient(180deg, #f8fafc 0%, #e2e8f0 100%) !important;
+        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+    }
+    /* Title styles - Dark text for readability */
+    .app-title {
+        font-size: 2.2em !important;
+        text-align: center !important;
+        color: #1e293b !important;
+        font-weight: 700 !important;
+        margin-bottom: 8px !important;
+    }
+    .app-subtitle {
+        font-size: 1.1em !important;
+        text-align: center !important;
+        color: #0369a1 !important;
+        font-weight: 500 !important;
+    }
+    .app-description {
+        text-align: center;
+        color: #475569 !important;
+        font-size: 0.95em !important;
+        line-height: 1.6 !important;
+    }
+    /* Section headers */
+    .section-header {
+        color: #1e293b !important;
+        font-weight: 600 !important;
+        border-bottom: 2px solid #0ea5e9 !important;
+        padding-bottom: 8px !important;
+        margin-bottom: 16px !important;
+    }
+    /* Chat history panel - Clean white card with more height */
+    .chat-history {
+        min-height: 500px;
+        max-height: 600px;
+        overflow-y: auto;
+        border: 1px solid #cbd5e1;
+        border-radius: 12px;
+        padding: 20px;
+        background: #ffffff;
+        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+        color: #334155 !important;
+    }
+    #chat-history {
+        min-height: 120px !important;
+        max-height: none !important;
+        height: auto !important;
+    }
+    .chat-history h3 {
+        color: #0f172a !important;
+    }
+    .chat-history strong {
+        color: #1e293b !important;
+    }
+    /* Document context panel */
+    .doc-context {
+        max-height: 380px;
+        overflow-y: auto;
+        border: 1px solid #cbd5e1;
+        border-radius: 12px;
+        padding: 20px;
+        background: #ffffff;
+        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+        color: #334155 !important;
+    }
+    .doc-context details {
+        margin-bottom: 12px;
+        padding: 14px;
+        background: #f1f5f9;
+        border-radius: 8px;
+        border-left: 4px solid #0ea5e9;
+    }
+    .doc-context summary {
+        cursor: pointer;
+        font-weight: 600;
+        color: #0369a1 !important;
+    }
+    .doc-context p, .doc-context span {
+        color: #475569 !important;
+    }
+    /* Answer box - Success green accent, auto-height */
+    .answer-box > div:nth-child(2) {
+        border-left: 4px solid #10b981 !important;
+        padding: 16px 16px 16px 20px !important;
+        background: #f0fdf4 !important;
+        border-radius: 8px !important;
+        min-height: 100px;
+        color: #166534 !important;
+    }
+    .answer-box p, .answer-box li, .answer-box span {
+        color: #166534 !important;
+    }
+    .answer-box strong {
+        color: #14532d !important;
+    }
+    .answer-box h1, .answer-box h2, .answer-box h3, .answer-box h4 {
+        color: #15803d !important;
+    }
+    .answer-box code {
+        background: #dcfce7 !important;
+        color: #166534 !important;
+        padding: 2px 6px !important;
+        border-radius: 4px !important;
+    }
+    .answer-box pre {
+        background: #dcfce7 !important;
+        padding: 12px !important;
+        border-radius: 6px !important;
+        overflow-x: auto !important;
+    }
+    /* Verification box - Blue accent */
+    .verification-box >  div:nth-child(2) {
+        border-left: 4px solid #0ea5e9 !important;
+        padding: 16px 16px 16px 20px !important;
+        background: #f0f9ff !important;
+        border-radius: 8px !important;
+        min-height: 80px;
+        color: #0369a1 !important;
+    }
+    .verification-box p, .verification-box li, .verification-box span {
+        color: #0c4a6e !important;
+    }
+    .verification-box strong {
+        color: #075985 !important;
+    }
+    /* Stats panel - Professional blue gradient */
+    .stats-panel {
+        background: linear-gradient(135deg, #0369a1 0%, #0284c7 50%, #0ea5e9 100%) !important;
+        color: #ffffff !important;
+        padding: 20px !important;
+        border-radius: 12px !important;
+        text-align: center;
+        box-shadow: 0 4px 14px rgba(3, 105, 161, 0.3);
+    }
+    .stats-panel strong {
+        color: #ffffff !important;
+    }
+    /* Info panel */
+    .info-panel {
+        background: #eff6ff !important;
+        border: 1px solid #bfdbfe !important;
+        border-radius: 8px !important;
+        padding: 12px !important;
+        color: #1e40af !important;
+    }
+    /* Form elements */
+    .gr-input, .gr-textbox textarea {
+        background: #ffffff !important;
+        border: 1px solid #cbd5e1 !important;
+        border-radius: 8px !important;
+        color: #1e293b !important;
+    }
+    .gr-input:focus, .gr-textbox textarea:focus {
+        border-color: #0ea5e9 !important;
+        box-shadow: 0 0 0 3px rgba(14, 165, 233, 0.1) !important;
+    }
+    /* Labels */
+    label {
+        color: #374151 !important;
+        font-weight: 500 !important;
+    }
+    /* Dropdown - High contrast with darker background for visibility */
+    .gr-dropdown,
+    [data-testid="dropdown"],
+    .svelte-dropdown,dropdownExample
+    div[class*="dropdown"] {
+        background: #e0e7ff !important;
+        color: #1e293b !important;
+        border: 2px solid #1e40af !important;
+        border-radius: 8px !important;
+        box-shadow: 0 2px 8px rgba(30, 64, 175, 0.2) !important;
+    }
+    .gr-dropdown:hover,
+    [data-testid="dropdown"]:hover {
+        background: #c7d2fe !important;
+        border-color: #1d4ed8 !important;
+        box-shadow: 0 4px 12px rgba(30, 64, 175, 0.3) !important;
+    }
+    .gr-dropdown select,
+    .gr-dropdown input,
+    [data-testid="dropdown"] input {
+        color: #1e293b !important;
+        background: transparent !important;
+        font-weight: 500 !important;
+    }
+    /* Dropdown container and options */
+    [data-testid="dropdown"] span,
+    .dropdown-container span,
+    div[class*="dropdown"] span {
+        color: #1e293b !important;
+        font-weight: 500 !important;
+    }
+    /* Dropdown list options */
+    .gr-dropdown ul,
+    .dropdown-options,
+    ul[class*="dropdown"] {
+        background: #ffffff !important;
+        border: 2px solid #1e40af !important;
+        border-radius: 8px !important;
+        box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15) !important;
+    }
+    .gr-dropdown li,
+    .dropdown-options li,
+    ul[class*="dropdown"] li {
+        color: #1e293b !important;
+        padding: 10px 14px !important;
+    }
+    .gr-dropdown li:hover,
+    ul[class*="dropdown"] li:hover {
+        background: #c7d2fe !important;
+        color: #1e40af !important;
+    }
+    /* Dropdown label */
+    .gr-dropdown label,
+    [data-testid="dropdown"] label {
+        color: #1e40af !important;
+        font-weight: 600 !important;
+    }
+    /* Tabs - Clean styling */
+    .tab-nav {
+        border-bottom: 2px solid #e2e8f0 !important;
+    }
+    .tab-nav button {
+        color: #64748b !important;
+        font-weight: 500 !important;
+        padding: 12px 20px !important;
+        border: none !important;
+        background: transparent !important;
+    }
+    .tab-nav button.selected {
+        color: #0369a1 !important;
+        border-bottom: 3px solid #0369a1 !important;
+        font-weight: 600 !important;
+    }
+    /* Markdown text */
+    .prose, .markdown-text {
+        color: #334155 !important;
+    }
+    .prose h1, .prose h2, .prose h3,
+    .markdown-text h1, .markdown-text h2, .markdown-text h3 {
+        color: #1e293b !important;
+    }
+    .prose strong, .markdown-text strong {
+        color: #0f172a !important;
+    }
+    /* Scrollbar styling */
+    ::-webkit-scrollbar {
+        width: 8px;
+        height: 8px;
+    }
+    ::-webkit-scrollbar-track {
+        background: #f1f5f9;
+        border-radius: 4px;
+    }
+    ::-webkit-scrollbar-thumb {
+        background: #94a3b8;
+        border-radius: 4px;
+    }
+    ::-webkit-scrollbar-thumb:hover {
+        background: #64748b;
+    }
+    button.secondary {
+        background: #1e40af !important;
+        color: #ffffff !important;
+        border: none !important;
+        border-radius: 8px !important;
+        font-weight: 600 !important;
+        box-shadow: 0 2px 6px rgba(30, 64, 175, 0.3) !important;
+        padding: 12px 20px !important;
+        min-height: 44px !important;
+    }
+    button.secondary:hover {
+        background: #1d4ed8 !important;
+        box-shadow: 0 4px 10px rgba(30, 64, 175, 0.4) !important;
+    }
+    /* Left side input boxes with borders */
+    .left-panel-box {
+        background: #fafafa !important;
+        border: 2px solid #94a3b8 !important;
+        border-radius: 10px !important;
+        padding: 14px !important;
+        margin-bottom: 8px !important;
+    }
+    .left-panel-box:hover {
+        border-color: #64748b !important;
+        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
+    }
+    /* File upload box with border */
+    .file-upload-box {
+        background: #f8fafc !important;
+        border: 2px dashed #64748b !important;
+        border-radius: 10px !important;
+        padding: 14px !important;
+    }
+    .file-upload-box:hover {
+        border-color: #0369a1 !important;
+        border-style: solid !important;
+        background: #f0f9ff !important;
+    }
+    /* Question input box with border */
+    .question-box {
+        background: #fffbeb !important;
+        border: 2px solid #f59e0b !important;
+        border-radius: 10px !important;
+        padding: 14px !important;
+    }
+    .question-box:hover {
+        border-color: #d97706 !important;
+        box-shadow: 0 2px 8px rgba(245, 158, 11, 0.2) !important;
+    }
+    /* Dropdown Example - Beige background on 3rd parent container */
+    .dropdownExample {
+        background: #f5f5dc !important;
+        padding: 16px !important;
+        border-radius: 8px !important;
+        border: 2px solid #d1d5db !important;
+        margin-bottom: 16px !important;
+    }
+    """
+    js = """
+    function createGradioAnimation() {
+        var container = document.createElement('div');
+        container.id = 'gradio-animation';
+        container.style.fontSize = '2.4em';
+        container.style.fontWeight = '700';
+        container.style.textAlign = 'center';
+        container.style.marginBottom = '20px';
+        container.style.marginTop = '10px';
+        container.style.color = '#0369a1';
+        container.style.letterSpacing = '-0.02em';
+        var text = '📄 SmartDoc AI';
+        for (var i = 0; i < text.length; i++) {
+            (function(i){
+                setTimeout(function(){
+                    var letter = document.createElement('span');
+                    letter.style.opacity = '0';
+                    letter.style.transition = 'opacity 0.2s ease';
+                    letter.innerText = text[i];
+                    container.appendChild(letter);
+                    setTimeout(function() { letter.style.opacity = '1'; }, 50);
+                }, i * 80);
+            })(i);
+        }
+        var gradioContainer = document.querySelector('.gradio-container');
+        gradioContainer.insertBefore(container, gradioContainer.firstChild);
+        return 'Animation created';
+    }
+    (() => {
+  const upload_messages = [
+    "Crunching your documents...",
+    "Warming up the AI...",
+    "Extracting knowledge...",
+    "Scanning for insights...",
+    "Preparing your data...",
+    "Looking for answers...",
+    "Analyzing file structure...",
+    "Reading your files...",
+    "Indexing content...",
+    "Almost ready..."
+  ];
+  let intervalId = null;
+  let timerId = null;
+  let startMs = null;
+  let lastMsg = null;
+  function pickMsg() {
+    if (upload_messages.length === 0) return "";
+    if (upload_messages.length === 1) return upload_messages[0];
+    let m;
+    do {
+      m = upload_messages[Math.floor(Math.random() * upload_messages.length)];
+    } while (m === lastMsg);
+    lastMsg = m;
+    return m;
+  }
+  function getMsgSpan() {
+    const root = document.getElementById("processing-message");
+    if (!root) return null;
+    return root.querySelector("#processing-msg");
+  }
+  function getTimerSpan() {
+    const root = document.getElementById("processing-message");
+    if (!root) return null;
+    return root.querySelector("#processing-timer");
+  }
+  function setMsg(text) {
+    const span = getMsgSpan();
+    if (!span) return;
+    span.textContent = text;
+  }
+  function formatElapsed(startMs) {
+    const s = (Date.now() - startMs) / 1000;
+    return `${s.toFixed(1)}s elapsed`;
+  }
+  function startRotationAndTimer() {
+    stopRotationAndTimer();
+    setMsg(pickMsg());
+    startMs = Date.now();
+    intervalId = setInterval(() => setMsg(pickMsg()), 2000);
+    const timerSpan = getTimerSpan();
+    if (timerSpan) {
+      timerSpan.textContent = formatElapsed(startMs);
+      timerId = setInterval(() => {
+        timerSpan.textContent = formatElapsed(startMs);
+      }, 200);
+    }
+  }
+  function stopRotationAndTimer() {
+    if (intervalId) {
+      clearInterval(intervalId);
+      intervalId = null;
+    }
+    if (timerId) {
+      clearInterval(timerId);
+      timerId = null;
+    }
+    const timerSpan = getTimerSpan();
+    if (timerSpan) timerSpan.textContent = "";
+  }
+  // Auto start/stop based on visibility of the processing box
+  function watchProcessingBox() {
+    const root = document.getElementById("processing-message");
+    if (!root) {
+      setTimeout(watchProcessingBox, 250);
+      return;
+    }
+    const isVisible = () => root.offsetParent !== null;
+    let prev = isVisible();
+    if (prev) startRotationAndTimer();
+    const obs = new MutationObserver(() => {
+      const now = isVisible();
+      if (now && !prev) startRotationAndTimer();
+      if (!now && prev) stopRotationAndTimer();
+      prev = now;
+    });
+    obs.observe(root, { attributes: true, attributeFilter: ["style", "class"] });
+  }
+  window.smartdocStartRotationAndTimer = startRotationAndTimer;
+  window.smartdocStopRotationAndTimer = stopRotationAndTimer;
+  watchProcessingBox();
+})();
+    """
+    with gr.Blocks(theme=gr.themes.Soft(), title="SmartDoc AI", css=css, js=js) as demo:
+        gr.Markdown("### SmartDoc AI - Document Q&A", elem_classes="app-title")
+        gr.Markdown("Upload your documents and ask questions. Answers will appear below, just like a chat.", elem_classes="app-description")
+        gr.Markdown("---")
+        # Examples dropdown
+        example_dropdown = gr.Dropdown(
+            label="Quick Start - Choose an Example",
+            choices=list(EXAMPLES.keys()),
+            value=None,
+            info="Select a pre-loaded example to try"
+        )
+        loaded_file_info = gr.Markdown("", elem_classes="info-panel", visible=False)
+        files = gr.Files(label="Upload your files", file_types=definitions.ALLOWED_TYPES)
+        question = gr.Textbox(label="Ask a question", lines=2, placeholder="Type your question here...")
+        chat = gr.Chatbot(label="Answers", elem_id="chat-history")
+        submit_btn = gr.Button("Get Answer", variant="primary")
+        processing_message = gr.HTML("", elem_id="processing-message", visible=False)
+        doc_context_display = gr.Markdown("*Submit a question to see which document sections were referenced*", elem_classes="doc-context", visible=False)
+        refresh_context_btn = gr.Button("Refresh Sources", variant="secondary", visible=False)
+        with gr.Tab("Context"):
+            pass  # No .render() calls here; components are already defined and used in outputs
+        session_state = gr.State({
+            "file_hashes": frozenset(),
+            "retriever": None,
+            "chat_history": [],
+            "last_documents": [],
+            "total_questions": 0,
+            "session_start": datetime.now().strftime("%Y-%m-%d %H:%M")
+        })
+        def process_question(question_text, uploaded_files, chat_history):
+            import time
+            import random
+            chat_history = chat_history or []
+            upload_messages = [
+                "Crunching your documents...",
+                "Warming up the AI...",
+                "Extracting knowledge...",
+                "Scanning for insights...",
+                "Preparing your data...",
+                "Looking for answers...",
+                "Analyzing file structure...",
+                "Reading your files...",
+                "Indexing content...",
+                "Almost ready..."
+            ]
+            last_msg = None
+            start_time = time.time()
+            msg = random.choice([m for m in upload_messages if m != last_msg])
+            last_msg = msg
+            yield (
+                chat_history,
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(interactive=False),
+                gr.update(interactive=False),
+                gr.update(interactive=False),
+                gr.update(interactive=False),
+                gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+            )
+            try:
+                if not question_text.strip():
+                    chat_history.append({"role": "user", "content": question_text})
+                    chat_history.append({"role": "assistant", "content": "Please enter a question."})
+                    yield (
+                        chat_history,
+                        gr.update(visible=False),
+                        gr.update(visible=False),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(value="", visible=False)
+                    )
+                    return
+                if not uploaded_files:
+                    chat_history.append({"role": "user", "content": question_text})
+                    chat_history.append({"role": "assistant", "content": "Please upload at least one document."})
+                    yield (
+                        chat_history,
+                        gr.update(visible=False),
+                        gr.update(visible=False),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(interactive=True),
+                        gr.update(value="", visible=False)
+                    )
+                    return
+                # Stage 2: Chunking with per-chunk progress and rotating status
+                all_chunks = []
+                seen_hashes = set()
+                total_chunks = 0
+                chunk_counts = []
+                for file in uploaded_files:
+                    with open(file.name, 'rb') as f:
+                        file_content = f.read()
+                        file_hash = processor._generate_hash(file_content)
+                    cache_path = processor.cache_dir / f"{file_hash}.pkl"
+                    if processor._is_cache_valid(cache_path):
+                        chunks = processor._load_from_cache(cache_path)
+                        if not chunks:
+                            chunks = processor._process_file(file)
+                            processor._save_to_cache(chunks, cache_path)
+                    else:
+                        chunks = processor._process_file(file)
+                        processor._save_to_cache(chunks, cache_path)
+                    chunk_counts.append(len(chunks))
+                    total_chunks += len(chunks)
+                if total_chunks == 0:
+                    total_chunks = 1
+                chunk_idx = 0
+                msg = random.choice(upload_messages)
+                for file, file_chunk_count in zip(uploaded_files, chunk_counts):
+                    with open(file.name, 'rb') as f:
+                        file_content = f.read()
+                        file_hash = processor._generate_hash(file_content)
+                    cache_path = processor.cache_dir / f"{file_hash}.pkl"
+                    if processor._is_cache_valid(cache_path):
+                        chunks = processor._load_from_cache(cache_path)
+                        if not chunks:
+                            chunks = processor._process_file(file)
+                            processor._save_to_cache(chunks, cache_path)
+                    else:
+                        chunks = processor._process_file(file)
+                        processor._save_to_cache(chunks, cache_path)
+                    for chunk in chunks:
+                        chunk_hash = processor._generate_hash(chunk.page_content.encode())
+                        if chunk_hash not in seen_hashes:
+                            seen_hashes.add(chunk_hash)
+                            all_chunks.append(chunk)
+                        # else: skip duplicate chunk
+                        chunk_idx += 1
+                        # Rotate status message every 10 seconds
+                        elapsed = time.time() - start_time
+                        if chunk_idx == 1 or (elapsed // 10) > ((elapsed-1) // 10):
+                            msg = random.choice([m for m in upload_messages if m != last_msg])
+                            last_msg = msg
+                        # When yielding progress, always do:
+                        yield (
+                            chat_history,
+                            gr.update(visible=False),
+                            gr.update(visible=False),
+                            gr.update(interactive=False),
+                            gr.update(interactive=False),
+                            gr.update(interactive=False),
+                            gr.update(interactive=False),
+                            gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+                        )
+                # After all chunks, show 100%
+                elapsed = time.time() - start_time
+                yield (
+                    chat_history,
+                    gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+                )
+                # Stage 3: Building Retriever
+                elapsed = time.time() - start_time
+                yield (
+                    chat_history,
+                    gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(value=(
+                        '<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04); display:flex; align-items:center;">'
+                        '<img src="https://media.giphy.com/media/26ufnwz3wDUli7GU0/giphy.gif" alt="AI working" style="height:40px; margin-right:16px;">'
+                        '<span id="processing-msg"></span>'
+                        '</div>'
+                    ), visible=True)
+                )
+                retriever = retriever_indexer.build_hybrid_retriever(all_chunks)
+                # Stage 4: Generating Answer
+                elapsed = time.time() - start_time
+                yield (
+                    chat_history,
+                    gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+                )
+                result = orchestrator.full_pipeline(question=question_text, retriever=retriever)
+                answer = result["draft_answer"]
+                # Stage 5: Verifying Answer
+                elapsed = time.time() - start_time
+                yield (
+                    chat_history,
+                    gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(interactive=False),
+                    gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+                )
+                verification = result.get("verification_report", "No verification details available.")
+                logger.info(f"Verification (internal):\n{verification}")
+                # Do not display verification to user, only use internally
+                chat_history.append({"role": "user", "content": question_text})
+                chat_history.append({"role": "assistant", "content": f"**Answer:**\n{answer}"})
+                session_state.value["last_documents"] = retriever.invoke(question_text)
+                # Final: Show results and make context tab visible
+                total_elapsed = time.time() - start_time
+                yield (
+                    chat_history,
+                    gr.update(visible=True),  # doc_context_display
+                    gr.update(visible=True),  # refresh_context_btn
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(value='''<div style="background:#fff; border-radius:8px; padding:18px 24px; margin-top:32px; color:#1e293b; font-size:1.2em; font-weight:600; box-shadow:0 2px 8px rgba(0,0,0,0.04);">
+  <span id="processing-msg"></span>
+  <span id="processing-timer" style="opacity:0.8; margin-left:8px;"></span>
+</div>''', visible=True)
+                )
+                time.sleep(1.5)
+                yield (
+                    chat_history,
+                    gr.update(visible=True),
+                    gr.update(visible=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(value="", visible=False)
+                )
+            except Exception as e:
+                logger.error(f"Processing error: {e}", exc_info=True)
+                chat_history.append({"role": "user", "content": question_text})
+                chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+                yield (
+                    chat_history,
+                    gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(interactive=True),
+                    gr.update(value="", visible=False)
+                )
+        submit_btn.click(
+            fn=process_question,
+            inputs=[question, files, chat],
+            outputs=[chat, doc_context_display, refresh_context_btn, submit_btn, question, files, example_dropdown, processing_message],
+            queue=True,
+            show_progress=True
+        )
+        def refresh_context():
+            docs = session_state.value.get("last_documents", [])
+            last_question = ""
+            for msg in reversed(chat.value or []):
+                if msg["role"] == "user":
+                    last_question = msg["content"]
+                    break
+            return format_document_context(docs, last_question)
+        refresh_context_btn.click(
+            fn=refresh_context,
+            inputs=[],
+            outputs=[doc_context_display]
+        )
+        def load_example(example_key):
+            if not example_key or example_key not in EXAMPLES:
+                return [], "", "Select a valid example from the dropdown above"
+            ex_data = EXAMPLES[example_key]
+            question_text = ex_data["question"]
+            file_paths = ex_data["file_paths"]
+            import tempfile
+            temp_dir = tempfile.mkdtemp()
+            copied_files = []
+            file_info_text = f"Loaded: {example_key}\n\n"
+            for source_file_path in file_paths:
+                abs_source = os.path.abspath(source_file_path)
+                if os.path.exists(abs_source):
+                    filename = os.path.basename(abs_source)
+                    temp_file_path = os.path.join(temp_dir, filename)
+                    shutil.copy2(abs_source, temp_file_path)
+                    copied_files.append(temp_file_path)
+                    file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
+                    file_info_text += f"{filename} ({file_size_mb:.2f} MB)\n"
+                else:
+                    file_info_text += f"{source_file_path} not found\n"
+            if not copied_files:
+                return [], "", "Could not load example files"
+            return copied_files, question_text, file_info_text
+        # Remove the Load Example button and related logic
+        # Instead, load the example immediately when dropdown changes
+        example_dropdown.change(
+            fn=load_example,
+            inputs=[example_dropdown],
+            outputs=[files, question, loaded_file_info]
+        )
+    # Launch server - Compatible with both local and Hugging Face Spaces
+    # HF Spaces sets SPACE_ID environment variable
+    is_hf_space = os.environ.get("SPACE_ID") is not None
+    if is_hf_space:
+        # Hugging Face Spaces configuration
+        logger.info("Running on Hugging Face Spaces")
+        demo.launch(server_name="0.0.0.0", server_port=7860)
+    else:
+        # Local development configuration
+        configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
+        server_port = _find_open_port(configured_port)
+        logger.info(f"Launching Gradio on port {server_port}")
+        logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
+        demo.launch(server_name="127.0.0.1", server_port=server_port, share=False)
+if __name__ == "__main__":
+    main()

maintenance.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import os
+import shutil
+import glob
+def clean_pycache():
+    """
+    Recursively finds and removes all __pycache__ directories and .pyc files
+    within the current working directory.
+    """
+    print("Starting Python cache cleanup...")
+    # Find and remove __pycache__ directories
+    pycache_dirs = glob.glob('**/__pycache__', recursive=True)
+    if not pycache_dirs:
+        print("No __pycache__ directories found.")
+    else:
+        for path in pycache_dirs:
+            try:
+                if os.path.isdir(path):
+                    shutil.rmtree(path)
+                    print(f"Removed directory: {path}")
+            except OSError as e:
+                print(f"Error removing directory {path}: {e}")
+    # Find and remove remaining .pyc files (less common)
+    pyc_files = glob.glob('**/*.pyc', recursive=True)
+    if not pyc_files:
+        print("No .pyc files found.")
+    else:
+        for path in pyc_files:
+            try:
+                if os.path.isfile(path):
+                    os.remove(path)
+                    print(f"Removed file: {path}")
+            except OSError as e:
+                print(f"Error removing file {path}: {e}")
+    print("Cleanup complete.")
+if __name__ == "__main__":
+    clean_pycache()

requirements.txt ADDED Viewed

	@@ -0,0 +1,53 @@

+# Core dependencies
+aiofiles>=23.2.1
+aiohttp>=3.11.0
+annotated-types>=0.7.0
+anyio>=4.8.0
+# PDF Processing
+pdfplumber>=0.11.0
+pdf2image>=1.17.0
+Pillow>=10.0.0
+# Computer Vision for local chart detection (cost optimization)
+opencv-python>=4.8.0
+# LangChain ecosystem
+langchain>=0.3.16
+langchain-core>=0.3.32
+langchain-text-splitters>=0.3.5
+langchain-google-genai>=2.0.0
+langchain-community>=0.3.16
+langchain-chroma>=0.1.0
+# Google AI for chart analysis
+google-generativeai>=0.8.0
+# Vector store
+chromadb>=0.6.3
+# Web framework
+gradio>=5.13.0
+# Data processing
+pandas>=2.1.4
+numpy>=1.26.4
+beautifulsoup4>=4.12.3
+# Document loaders
+python-docx>=1.1.2
+docx2txt>=0.8
+# Configuration
+pydantic>=2.11.10,<2.12.5
+pydantic-settings>=2.10.1,<3.0.0
+python-dotenv>=1.0.1
+# BM25 retriever
+rank-bm25>=0.2.2
+# Utilities
+tqdm>=4.67.0
+requests>=2.32.0
+tiktoken>=0.8.0
+tenacity>=9.0.0

search_engine/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .indexer import RetrieverBuilder
2	+
3	+ __all__ = ["RetrieverBuilder"]

search_engine/indexer.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Retriever indexer module for DocChat.
+Provides utilities for building different types of retrievers:
+- Vector-based retriever (ChromaDB + embeddings)
+- Hybrid retriever (BM25 + Vector with ensemble)
+"""
+import logging
+import sys
+from typing import List, Any
+import time
+import hashlib
+import os
+import json
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_chroma import Chroma
+from langchain_community.retrievers import BM25Retriever
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_core.vectorstores import VectorStoreRetriever
+from configuration.parameters import parameters
+logger = logging.getLogger(__name__)
+def doc_id(doc) -> str:
+    src = doc.metadata.get("source", "")
+    page = doc.metadata.get("page", "")
+    chunk = doc.metadata.get("chunk_id", "")
+    base = f"{src}::{page}::{chunk}"
+    return hashlib.sha256(base.encode("utf-8")).hexdigest()
+def content_hash(doc) -> str:
+    return hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
+def load_manifest(path):
+    if os.path.exists(path):
+        with open(path, "r") as f:
+            return json.load(f)
+    return {}
+def save_manifest(path, manifest):
+    with open(path, "w") as f:
+        json.dump(manifest, f)
+class EnsembleRetriever(BaseRetriever):
+    """
+    Custom Ensemble Retriever combining multiple retrievers with weighted RRF.
+    Attributes:
+        retrievers: List of retriever instances
+        weights: List of weights (should sum to 1.0)
+        c: RRF constant (default: 60)
+        k: Max documents to return (default: 10)
+    """
+    retrievers: List[Any]
+    weights: List[float]
+    c: int = 60
+    k: int = 10
+    class Config:
+        arbitrary_types_allowed = True
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun = None
+    ) -> List[Document]:
+        """Retrieve and combine documents using weighted RRF, deduplicating charts by content and aggregating page numbers."""
+        logger.debug(f"[ENSEMBLE] Query: {query[:80]}...")
+        all_docs_with_scores = {}
+        retriever_names = ["BM25", "Vector"]
+        for idx, (retriever, weight) in enumerate(zip(self.retrievers, self.weights)):
+            retriever_name = retriever_names[idx] if idx < len(retriever_names) else f"Retriever_{idx}"
+            try:
+                docs = retriever.invoke(query)
+                logger.debug(f"[ENSEMBLE] {retriever_name}: {len(docs)} docs (weight: {weight})")
+                for rank, doc in enumerate(docs):
+                    # Deduplicate by content and source only
+                    doc_key = (doc.page_content, doc.metadata.get('source', ''))
+                    rrf_score = weight / (rank + 1 + self.c)
+                    if doc_key in all_docs_with_scores:
+                        existing_doc, existing_score = all_docs_with_scores[doc_key]
+                        # Aggregate page numbers
+                        existing_pages = set()
+                        if isinstance(existing_doc.metadata.get('page'), list):
+                            existing_pages.update(existing_doc.metadata['page'])
+                        else:
+                            existing_pages.add(existing_doc.metadata.get('page'))
+                        existing_pages.add(doc.metadata.get('page'))
+                        # Update metadata to include all pages
+                        existing_doc.metadata['page'] = sorted(p for p in existing_pages if p is not None)
+                        all_docs_with_scores[doc_key] = (existing_doc, existing_score + rrf_score)
+                    else:
+                        all_docs_with_scores[doc_key] = (doc, rrf_score)
+            except Exception as e:
+                logger.warning(f"[ENSEMBLE] {retriever_name} failed: {e}")
+                continue
+        sorted_docs = sorted(all_docs_with_scores.values(), key=lambda x: x[1], reverse=True)
+        result = [doc for doc, score in sorted_docs[:self.k]]
+        logger.debug(f"[ENSEMBLE] Returning {len(result)} documents")
+        return result
+class RetrieverBuilder:
+    """Builder class for creating document retrievers with caching."""
+    def __init__(self):
+        """Initialize with embeddings model."""
+        self.embeddings = GoogleGenerativeAIEmbeddings(
+            model="models/text-embedding-004",
+            google_api_key=parameters.GOOGLE_API_KEY,
+            batch_size=32,  # Enable batching for faster embedding computation
+        )
+        self._retriever_cache = {}  # {docset_hash: retriever}
+    def _hash_docs(self, docs):
+        # Create a hash of all document contents and metadata
+        m = hashlib.sha256()
+        for doc in docs:
+            m.update(doc.page_content.encode('utf-8'))
+            for k, v in sorted(doc.metadata.items()):
+                m.update(str(k).encode('utf-8'))
+                m.update(str(v).encode('utf-8'))
+        return m.hexdigest()
+    def build_hybrid_retriever(self, docs) -> EnsembleRetriever:
+        """
+        Build hybrid retriever using BM25 and vector search.
+        Args:
+            docs: List of documents to index
+        Returns:
+            EnsembleRetriever combining BM25 and vector search
+        """
+        logger.info(f"Building hybrid retriever with {len(docs)} documents...")
+        if not docs:
+            raise ValueError("No documents provided")
+        chroma_dir = parameters.CHROMA_DB_PATH
+        manifest_path = os.path.join(chroma_dir, "indexed_manifest.json")
+        os.makedirs(chroma_dir, exist_ok=True)
+        manifest = load_manifest(manifest_path)
+        vector_store = Chroma(
+            embedding_function=self.embeddings,
+            persist_directory=chroma_dir,
+        )
+        to_add = []
+        ids_to_add = []
+        to_delete_ids = []
+        current_ids = set()
+        for d in docs:
+            _id = doc_id(d)
+            _hash = content_hash(d)
+            current_ids.add(_id)
+            if _id not in manifest:
+                to_add.append(d)
+                ids_to_add.append(_id)
+                manifest[_id] = _hash
+            elif manifest[_id] != _hash:
+                to_delete_ids.append(_id)
+                to_add.append(d)
+                ids_to_add.append(_id)
+                manifest[_id] = _hash
+        if to_add:
+            # Safety net: de-dupe before add_documents
+            seen = set()
+            uniq_docs, uniq_ids = [], []
+            for doc, _id in zip(to_add, ids_to_add):
+                if _id in seen:
+                    continue
+                seen.add(_id)
+                uniq_docs.append(doc)
+                uniq_ids.append(_id)
+            # Debugging: show duplicate IDs and their sources
+            from collections import Counter
+            counts = Counter(ids_to_add)
+            dupes = [i for i, c in counts.items() if c > 1]
+            if dupes:
+                print("Duplicate IDs:", len(dupes))
+                for d in dupes[:10]:
+                    idxs = [k for k, x in enumerate(ids_to_add) if x == d]
+                    print("ID:", d, "examples:")
+                    for k in idxs[:3]:
+                        md = to_add[k].metadata
+                        print("  ", md.get("source"), md.get("page"), md.get("chunk_index"))
+            vector_store.add_documents(uniq_docs, ids=uniq_ids)
+        save_manifest(manifest_path, manifest)
+        # Create BM25 retriever
+        t_bm25_start = time.time()
+        texts = [doc.page_content for doc in docs]
+        metadatas = [doc.metadata for doc in docs]
+        bm25_retriever = BM25Retriever.from_texts(texts=texts, metadatas=metadatas)
+        bm25_retriever.k = parameters.BM25_SEARCH_K
+        t_bm25_end = time.time()
+        logger.info(f"[PROFILE] BM25 retriever creation: {t_bm25_end - t_bm25_start:.2f}s")
+        logger.debug(f"BM25 indexed {len(texts)} texts, k={bm25_retriever.k}")
+        t_vec_retr_start = time.time()
+        vector_retriever = vector_store.as_retriever(
+            search_type="mmr",
+            search_kwargs={
+                "k": parameters.VECTOR_Search_K_CHROMA,
+                "fetch_k": parameters.VECTOR_FETCH_K,
+                "lambda_mult": 0.7,
+            },
+        )
+        t_vec_retr_end = time.time()
+        logger.info(f"[PROFILE] Vector retriever creation: {t_vec_retr_end - t_vec_retr_start:.2f}s")
+        logger.debug("Vector retriever created")
+        t_ensemble_start = time.time()
+        hybrid_retriever = EnsembleRetriever(
+            retrievers=[bm25_retriever, vector_retriever],
+            weights=parameters.HYBRID_RETRIEVER_WEIGHTS,
+            k=parameters.VECTOR_SEARCH_K,
+        )
+        t_ensemble_end = time.time()
+        logger.info(f"[PROFILE] Ensemble retriever creation: {t_ensemble_end - t_ensemble_start:.2f}s")
+        logger.info(f"Hybrid retriever created (k={parameters.VECTOR_SEARCH_K})")
+        logger.info(f"[PROFILE] Total hybrid retriever build: {t_ensemble_end - t_bm25_start:.2f}s")
+        return hybrid_retriever

test_token_size.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#!/usr/bin/env python
+"""Test script to analyze token size of retrieved documents."""
+import sys
+sys.path.insert(0, r'd:\MultiRAGgent\docchat')
+from content_analyzer.document_parser import DocumentProcessor
+from search_engine.indexer import RetrieverBuilder
+from pathlib import Path
+# Initialize
+processor = DocumentProcessor()
+retriever_indexer = RetrieverBuilder()
+# Load the example document
+example_file = Path(r'd:\MultiRAGgent\docchat\examples\google-2024-environmental-report.pdf')
+print(f"\n{'='*80}")
+print("[TOKEN_ANALYSIS] Loading document: {example_file.name}")
+print(f"{'='*80}\n")
+# Process document
+chunks = processor.process([str(example_file)])
+print(f"[TOKEN_ANALYSIS] ✓ Loaded {len(chunks)} chunks from document")
+# Build retriever
+print(f"\n[TOKEN_ANALYSIS] Building hybrid retriever...")
+retriever = retriever_indexer.build_retriever_with_scores(chunks)
+print(f"[TOKEN_ANALYSIS] ✓ Retriever built\n")
+# Test retrieval
+question = "Retrieve the data center PUE efficiency values in Singapore 2nd facility in 2019 and 2022"
+print(f"[TOKEN_ANALYSIS] Question: {question}\n")
+retrieved_docs = retriever.invoke(question)
+# Calculate token metrics
+print(f"\n{'='*80}")
+print(f"[TOKEN_ANALYSIS] RETRIEVAL RESULTS")
+print(f"{'='*80}\n")
+print(f"[TOKEN_ANALYSIS] Retrieved {len(retrieved_docs)} documents")
+# Character and token analysis
+total_chars = sum(len(doc.page_content) for doc in retrieved_docs)
+# Different tokenization estimates
+tokens_gpt = total_chars / 4  # ~4 chars per token (GPT)
+tokens_gemini = total_chars / 3  # ~3 chars per token (Gemini - more aggressive)
+tokens_claude = total_chars / 4.5  # ~4.5 chars per token (Claude)
+if retrieved_docs:
+    avg_chars = total_chars // len(retrieved_docs)
+    avg_tokens_gemini = avg_chars // 3
+    print(f"\n[CHARACTER COUNT]")
+    print(f"  Total characters: {total_chars:,}")
+    print(f"  Average per doc:  {avg_chars:,} chars")
+    print(f"\n[TOKEN COUNT ESTIMATES]")
+    print(f"  Gemini (1 token ≈ 3 chars): {tokens_gemini:,.0f} tokens")
+    print(f"  GPT/Claude (1 token ≈ 4 chars): {tokens_gpt:,.0f} tokens")
+    print(f"  Average per doc (Gemini): {avg_tokens_gemini:,} tokens")
+    print(f"\n[QUOTA ANALYSIS]")
+    print(f"  Gemini free tier limit: 250,000 tokens/day")
+    print(f"  Your 64 docs use: {tokens_gemini:,.0f} tokens")
+    percentage = (tokens_gemini / 250000) * 100
+    print(f"  Percentage of daily quota: {percentage:.1f}%")
+    print(f"\n[DOCUMENT SIZE BREAKDOWN]")
+    for i, doc in enumerate(retrieved_docs[:5], 1):
+        chars = len(doc.page_content)
+        tokens = chars // 3
+        print(f"  Doc {i}: {chars:,} chars (~{tokens:,} tokens)")
+    if len(retrieved_docs) > 5:
+        print(f"  ... and {len(retrieved_docs) - 5} more documents")
+print(f"\n{'='*80}\n")

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""
+Test fixtures and shared utilities for DocChat tests.
+"""
+import pytest
+from unittest.mock import MagicMock
+from langchain_core.documents import Document
+class FakeLLM:
+    """Mock LLM for testing without API calls."""
+    def __init__(self, content: str = "Test response"):
+        self.content = content
+        self.last_prompt = None
+        self.invoke_count = 0
+    def invoke(self, prompt: str):
+        self.last_prompt = prompt
+        self.invoke_count += 1
+        return type("Response", (), {"content": self.content})()
+class FakeRetriever:
+    """Mock retriever for testing without vector store."""
+    def __init__(self, documents: list = None):
+        self.documents = documents or []
+        self.invoke_count = 0
+        self.last_query = None
+    def invoke(self, query: str):
+        self.last_query = query
+        self.invoke_count += 1
+        return self.documents
+@pytest.fixture
+def sample_documents():
+    """Create sample documents for testing."""
+    return [
+        Document(
+            page_content="The data center in Singapore achieved a PUE of 1.12 in 2022.",
+            metadata={"source": "test.pdf", "page": 1}
+        ),
+        Document(
+            page_content="Carbon-free energy in Asia Pacific reached 45% in 2023.",
+            metadata={"source": "test.pdf", "page": 2}
+        ),
+        Document(
+            page_content="DeepSeek-R1 outperformed o1-mini on coding benchmarks.",
+            metadata={"source": "deepseek.pdf", "page": 1}
+        ),
+    ]
+@pytest.fixture
+def fake_llm():
+    """Create a fake LLM for testing."""
+    return FakeLLM("This is a test response.")
+@pytest.fixture
+def fake_retriever(sample_documents):
+    """Create a fake retriever with sample documents."""
+    return FakeRetriever(sample_documents)
+@pytest.fixture
+def empty_retriever():
+    """Create a fake retriever that returns no documents."""
+    return FakeRetriever([])

tests/test_accuracy_verifier.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""
+Tests for the VerificationAgent.
+"""
+import pytest
+from unittest.mock import MagicMock, patch
+from langchain_core.documents import Document
+# Import after setting up mocks to avoid API key validation
+import sys
+sys.path.insert(0, '.')
+class TestVerificationAgent:
+    """Test suite for VerificationAgent.
+
+    The agent under test is built with the ``fake_llm`` fixture in place of
+    a real model.
+    """
+    @pytest.fixture
+    def mock_parameters(self, monkeypatch):
+        """Mock parameters to avoid API key requirement.
+
+        Sets a dummy ``GOOGLE_API_KEY`` environment variable via monkeypatch.
+        """
+        monkeypatch.setenv("GOOGLE_API_KEY", "test_key_for_testing")
+    @pytest.fixture
+    def accuracy_verifier(self, mock_parameters, fake_llm):
+        """Create a VerificationAgent with mocked LLM."""
+        # Imported here, not at module level, so the dummy API key set by
+        # mock_parameters is already in the environment.
+        from intelligence.accuracy_verifier import VerificationAgent
+        return VerificationAgent(llm=fake_llm)
+    def test_check_with_supported_answer(self, accuracy_verifier, sample_documents):
+        """Test verification with an answer supported by documents."""
+        # Configure the fake LLM to return a supported response
+        accuracy_verifier.llm.content = """
+        Supported: YES
+        Unsupported Claims: []
+        Contradictions: []
+        Relevant: YES
+        Additional Details: The answer is well-supported by the context.
+        """
+        result = accuracy_verifier.check(
+            answer="The PUE in Singapore was 1.12 in 2022.",
+            documents=sample_documents
+        )
+        # check() is expected to return a mapping carrying both the report
+        # text and the context it used.
+        assert "verification_report" in result
+        assert "Supported: YES" in result["verification_report"]
+        assert "context_used" in result
+    def test_check_with_unsupported_answer(self, accuracy_verifier, sample_documents):
+        """Test verification with an unsupported answer."""
+        # Configure the fake LLM to return an unsupported response
+        accuracy_verifier.llm.content = """
+        Supported: NO
+        Unsupported Claims: [The PUE was 1.5]
+        Contradictions: []
+        Relevant: YES
+        Additional Details: The claimed PUE value is not in the context.
+        """
+        result = accuracy_verifier.check(
+            answer="The PUE in Singapore was 1.5 in 2022.",
+            documents=sample_documents
+        )
+        assert "Supported: NO" in result["verification_report"]
+    def test_parse_verification_response_valid(self, accuracy_verifier):
+        """Test parsing a valid verification response."""
+        response = """
+        Supported: YES
+        Unsupported Claims: []
+        Contradictions: []
+        Relevant: YES
+        Additional Details: All claims verified.
+        """
+        parsed = accuracy_verifier.parse_verification_response(response)
+        assert parsed["Supported"] == "YES"
+        assert parsed["Relevant"] == "YES"
+        # An empty "[]" field is expected to parse to an empty list.
+        assert parsed["Unsupported Claims"] == []
+    def test_parse_verification_response_with_claims(self, accuracy_verifier):
+        """Test parsing response with unsupported claims."""
+        response = """
+        Supported: NO
+        Unsupported Claims: [claim1, claim2]
+        Contradictions: [contradiction1]
+        Relevant: YES
+        Additional Details: Multiple issues found.
+        """
+        parsed = accuracy_verifier.parse_verification_response(response)
+        assert parsed["Supported"] == "NO"
+        # Bracketed, comma-separated fields are expected to parse to one
+        # list entry per item.
+        assert len(parsed["Unsupported Claims"]) == 2
+        assert len(parsed["Contradictions"]) == 1
+    def test_format_verification_report(self, accuracy_verifier):
+        """Test formatting a verification report."""
+        verification = {
+            "Supported": "YES",
+            "Unsupported Claims": [],
+            "Contradictions": [],
+            "Relevant": "YES",
+            "Additional Details": "Well verified."
+        }
+        report = accuracy_verifier.format_verification_report(verification)
+        assert "**Supported:** YES" in report
+        assert "**Relevant:** YES" in report
+        # An empty claims list is expected to be rendered as "None".
+        assert "**Unsupported Claims:** None" in report

tests/test_context_validator.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+Tests for the RelevanceChecker.
+"""
+import pytest
+from unittest.mock import MagicMock
+from langchain_core.documents import Document
+import sys
+sys.path.insert(0, '.')
+class TestRelevanceChecker:
+    """Test suite for RelevanceChecker.
+
+    Each test sets the fake LLM's ``content`` to the label it should emit
+    (where relevant) and asserts the label returned by ``check``.
+    """
+    @pytest.fixture
+    def mock_parameters(self, monkeypatch):
+        """Mock parameters to avoid API key requirement.
+
+        Sets a dummy ``GOOGLE_API_KEY`` environment variable via monkeypatch.
+        """
+        monkeypatch.setenv("GOOGLE_API_KEY", "test_key_for_testing")
+    @pytest.fixture
+    def context_validator(self, mock_parameters, fake_llm):
+        """Create a RelevanceChecker with mocked LLM."""
+        # Imported here, not at module level, so the dummy API key set by
+        # mock_parameters is already in the environment.
+        from intelligence.context_validator import RelevanceChecker
+        # Build the checker with its defaults, then swap in the fake LLM.
+        checker = RelevanceChecker()
+        checker.llm = fake_llm
+        return checker
+    def test_check_can_answer(self, context_validator, fake_retriever):
+        """Test when documents can fully answer the question."""
+        context_validator.llm.content = "CAN_ANSWER"
+        result = context_validator.check(
+            question="What is the PUE in Singapore?",
+            retriever=fake_retriever,
+            k=3
+        )
+        assert result == "CAN_ANSWER"
+        # The retriever is expected to be queried exactly once per check.
+        assert fake_retriever.invoke_count == 1
+    def test_check_partial_match(self, context_validator, fake_retriever):
+        """Test when documents partially match the question."""
+        context_validator.llm.content = "PARTIAL"
+        result = context_validator.check(
+            question="What is the historical trend of PUE?",
+            retriever=fake_retriever,
+            k=3
+        )
+        assert result == "PARTIAL"
+    def test_check_no_match(self, context_validator, fake_retriever):
+        """Test when documents don't match the question."""
+        context_validator.llm.content = "NO_MATCH"
+        result = context_validator.check(
+            question="What is the weather in Paris?",
+            retriever=fake_retriever,
+            k=3
+        )
+        assert result == "NO_MATCH"
+    def test_check_empty_question(self, context_validator, fake_retriever):
+        """Test with empty question returns NO_MATCH."""
+        # llm.content is not configured here; NO_MATCH is still expected.
+        result = context_validator.check(
+            question="",
+            retriever=fake_retriever,
+            k=3
+        )
+        assert result == "NO_MATCH"
+    def test_check_empty_retriever_results(self, context_validator, empty_retriever):
+        """Test when retriever returns no documents."""
+        result = context_validator.check(
+            question="Any question",
+            retriever=empty_retriever,
+            k=3
+        )
+        assert result == "NO_MATCH"
+    def test_check_invalid_llm_response(self, context_validator, fake_retriever):
+        """Test when LLM returns invalid response."""
+        # A label outside the accepted set is expected to fall back to NO_MATCH.
+        context_validator.llm.content = "INVALID_LABEL"
+        result = context_validator.check(
+            question="What is the PUE?",
+            retriever=fake_retriever,
+            k=3
+        )
+        assert result == "NO_MATCH"
+    def test_check_retriever_exception(self, context_validator):
+        """Test when retriever throws an exception."""
+        # An exception raised by retriever.invoke is expected to be mapped
+        # to NO_MATCH rather than propagate to the caller.
+        failing_retriever = MagicMock()
+        failing_retriever.invoke.side_effect = Exception("Connection error")
+        result = context_validator.check(
+            question="Any question",
+            retriever=failing_retriever,
+            k=3
+        )
+        assert result == "NO_MATCH"
+    def test_check_invalid_k_value(self, context_validator, fake_retriever):
+        """Test with invalid k value defaults to 3."""
+        # NOTE(review): only the returned label is asserted; the claimed
+        # fallback of k to 3 is not verified by this test.
+        context_validator.llm.content = "CAN_ANSWER"
+        result = context_validator.check(
+            question="What is the PUE?",
+            retriever=fake_retriever,
+            k=-1
+        )
+        assert result == "CAN_ANSWER"

tests/test_knowledge_synthesizer.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import unittest
+try:
+    from langchain_core.documents import Document
+    from intelligence.knowledge_synthesizer import ResearchAgent
+    LANGCHAIN_AVAILABLE = True
+except ImportError:
+    Document = None  # type: ignore
+    ResearchAgent = None  # type: ignore
+    LANGCHAIN_AVAILABLE = False
+class FakeLLM:
+    """Simple stand-in for ChatGoogleGenerativeAI to avoid network calls.
+
+    ``content`` is the canned reply text; ``last_prompt`` holds the most
+    recent prompt passed to ``invoke`` (``None`` until the first call).
+    """
+    def __init__(self, content: str) -> None:
+        self.content = content
+        self.last_prompt = None
+    def invoke(self, prompt: str):
+        """Record ``prompt`` and return a response object exposing ``.content``."""
+        self.last_prompt = prompt
+        # Return an instance, not the bare class, so the reply behaves like
+        # an ordinary response object; ``.content`` reads the same either way.
+        return type("Resp", (), {"content": self.content})()
+@unittest.skipUnless(LANGCHAIN_AVAILABLE, "langchain not installed in this environment")
+class ResearchAgentTests(unittest.TestCase):
+    """Tests for ResearchAgent.generate driven by the local FakeLLM stub."""
+    def test_generate_returns_stubbed_content_with_citations(self):
+        """The draft answer is the stubbed LLM text and the prompt contains document text."""
+        # NOTE(review): despite the test name, nothing about citations is asserted here.
+        docs = [
+            Document(page_content="Alpha text", metadata={"id": "a1"}),
+            Document(page_content="Beta text", metadata={"source": "s1"}),
+        ]
+        llm = FakeLLM("Answer about alpha")
+        agent = ResearchAgent(llm=llm, top_k=1, max_context_chars=200)
+        result = agent.generate("What is alpha?", docs)
+        self.assertEqual(result["draft_answer"], "Answer about alpha")
+        # The first document's text must have reached the prompt sent to the LLM.
+        self.assertIn("Alpha text", llm.last_prompt)
+    def test_generate_handles_no_documents(self):
+        """With no documents, the draft answer carries a 'could not find' message."""
+        llm = FakeLLM("unused")
+        agent = ResearchAgent(llm=llm)
+        result = agent.generate("Any question", [])
+        self.assertIn("could not find supporting documents", result["draft_answer"])
+if __name__ == "__main__":
+    unittest.main()

tests/test_visual_extraction.py ADDED Viewed

	@@ -0,0 +1,169 @@

+"""
+Test script for Gemini Vision chart extraction.
+This script demonstrates how to use the chart extraction feature
+and validates that it's working correctly.
+"""
+import logging
+import os
+import sys
+from pathlib import Path
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from content_analyzer.document_parser import DocumentProcessor
+from configuration.parameters import parameters
+# Configure logging
+# NOTE: this runs at import time and targets the root logger, so merely
+# importing this module (e.g. during pytest collection) may change logging
+# output for the rest of the process.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+def test_chart_extraction():
+    """Test chart extraction on a sample PDF with charts.
+
+    Behaves as a logged smoke check rather than an asserting test: it returns
+    early when chart extraction is disabled, when the Gemini client is not
+    initialized, or when none of the candidate PDFs exist, and it logs
+    (without re-raising) any error raised while processing.
+    """
+    # NOTE(review): the leading "?" characters in the log messages below look
+    # like emoji lost in an encoding conversion - confirm against the original file.
+    logger.info("=" * 60)
+    logger.info("Testing Gemini Vision Chart Extraction")
+    logger.info("=" * 60)
+    # Check if chart extraction is enabled
+    if not parameters.ENABLE_CHART_EXTRACTION:
+        logger.warning("?? Chart extraction is DISABLED")
+        logger.info("Enable it by setting ENABLE_CHART_EXTRACTION=true in .env")
+        return
+    logger.info(f"? Chart extraction enabled")
+    logger.info(f"?? Using model: {parameters.CHART_VISION_MODEL}")
+    logger.info(f"?? Max tokens: {parameters.CHART_MAX_TOKENS}")
+    # Initialize processor
+    try:
+        processor = DocumentProcessor()
+        logger.info("? DocumentProcessor initialized")
+        if processor.gemini_client:
+            logger.info("? Gemini Vision client ready")
+        else:
+            logger.error("? Gemini Vision client not initialized")
+            return
+    except Exception as e:
+        logger.error(f"? Failed to initialize processor: {e}")
+        return
+    # Test with example PDF (if exists)
+    # These paths are relative to the current working directory, not to this file.
+    # NOTE(review): "deppseek.pdf" may be a typo for "deepseek.pdf", and "test/"
+    # may be meant as "tests/" - confirm against the repository layout.
+    test_files = [
+        "examples/google-2024-environmental-report.pdf",
+        "examples/deppseek.pdf",
+        "test/sample_with_charts.pdf"
+    ]
+    # Use the first candidate that exists on disk.
+    found_file = None
+    for test_file in test_files:
+        if os.path.exists(test_file):
+            found_file = test_file
+            break
+    if not found_file:
+        logger.warning("?? No test PDF files found")
+        logger.info("Available test files:")
+        for tf in test_files:
+            logger.info(f"  - {tf}")
+        logger.info("\nTo test manually:")
+        logger.info("1. Place a PDF with charts in one of the above locations")
+        logger.info("2. Run this script again")
+        return
+    logger.info(f"\n?? Processing test file: {found_file}")
+    # Create mock file object
+    # Exposes only .name and .size - presumably the attributes that
+    # processor.process() reads; confirm against DocumentProcessor.
+    class MockFile:
+        def __init__(self, path):
+            self.name = path
+            self.size = os.path.getsize(path)
+    try:
+        # Process the file
+        mock_file = MockFile(found_file)
+        chunks = processor.process([mock_file])
+        logger.info(f"\n? Processing complete!")
+        logger.info(f"?? Total chunks extracted: {len(chunks)}")
+        # Count chart chunks
+        # Chunks whose metadata "type" is "chart" are charts; everything else counts as text.
+        chart_chunks = [c for c in chunks if c.metadata.get("type") == "chart"]
+        text_chunks = [c for c in chunks if c.metadata.get("type") != "chart"]
+        logger.info(f"?? Chart chunks: {len(chart_chunks)}")
+        logger.info(f"?? Text chunks: {len(text_chunks)}")
+        # Display chart analyses
+        if chart_chunks:
+            logger.info(f"\n{'=' * 60}")
+            logger.info("?? CHART ANALYSES EXTRACTED:")
+            logger.info('=' * 60)
+            for i, chunk in enumerate(chart_chunks, 1):
+                logger.info(f"\n--- Chart {i} ---")
+                logger.info(f"Page: {chunk.metadata.get('page')}")
+                # Only the first 200 characters of each analysis are logged.
+                logger.info(f"Preview: {chunk.page_content[:200]}...")
+                logger.info("")
+        else:
+            logger.info("\n?? No charts detected in this document")
+            logger.info("This could mean:")
+            logger.info("  - Document contains no charts")
+            logger.info("  - Charts are embedded as tables (already extracted)")
+            logger.info("  - Charts are too complex for detection")
+        logger.info(f"\n{'=' * 60}")
+        logger.info("? Test completed successfully!")
+        logger.info('=' * 60)
+    except Exception as e:
+        # Logged with traceback but not re-raised, so the function still
+        # returns normally (and would pass under pytest) after a failure.
+        logger.error(f"? Test failed: {e}", exc_info=True)
+def test_api_connection():
+    """Test Gemini API connection.
+
+    Configures the client with ``parameters.GOOGLE_API_KEY``, sends a single
+    text prompt to ``parameters.CHART_VISION_MODEL`` and logs the reply.
+    Failures are logged, not raised.
+    """
+    logger.info("\n" + "=" * 60)
+    logger.info("Testing Gemini API Connection")
+    logger.info("=" * 60)
+    try:
+        # Imported inside the try so a missing package is reported via the
+        # ImportError handler below instead of failing at module import.
+        import google.generativeai as genai
+        # NOTE(review): Image and io are not used below; the PIL import
+        # presumably acts as a dependency check (see the install hint) - confirm.
+        from PIL import Image
+        import io
+        genai.configure(api_key=parameters.GOOGLE_API_KEY)
+        model = genai.GenerativeModel(parameters.CHART_VISION_MODEL)
+        logger.info("? Gemini client initialized")
+        # Test with a simple text prompt
+        response = model.generate_content("Hello! Can you respond with 'API Working'?")
+        logger.info(f"? API Response: {response.text}")
+        logger.info("? Gemini API connection successful!")
+    except ImportError as e:
+        logger.error(f"? Missing dependency: {e}")
+        logger.info("Install with: pip install google-generativeai Pillow")
+    except Exception as e:
+        # Any other failure is logged together with an API-key hint.
+        logger.error(f"? API test failed: {e}")
+        logger.info("Check your GOOGLE_API_KEY in .env file")
+# Script entry point: run both checks in order when executed directly.
+if __name__ == "__main__":
+    print("\n?? SmartDoc AI - Chart Extraction Test Suite\n")
+    # Test 1: API Connection
+    test_api_connection()
+    # Test 2: Chart Extraction
+    test_chart_extraction()
+    # Both checks log their own failures instead of raising, so this line is
+    # printed whether or not they succeeded.
+    print("\n? All tests completed!\n")

vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8fe3c8d74ae8a7762e6f389543f0f2c53e6127832955b377ed768f8759db70d
+size 16165996

vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:059abd7ab166731c13bd8dc4dc0724104918b450e9625ca4bc9f27ed0016170e
+size 100

vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc43535869cc54fbd80a6a47dac2fd0b07f4eeb0c028b5c96026b6cdc271832b
+size 463184

vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6bfa281c8fe4e4977d5382b077dee4a3c4e5c750985cdf3d3660a6f92dab67
+size 20132

vector_store/33eccd62-a7fc-4b0d-a118-02552f5cad42/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffcd2c7be0de4c70919af69080b33cbd5c7487471058b2a70ee5bf95ab86ea00
+size 42436