Ara Yeroyan committed
Commit f5df983 · 1 Parent(s): 26449fc
src/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ Audit QA Refactored Module
+ A modular and maintainable RAG pipeline for audit report analysis.
+ """
+
+ from .pipeline import PipelineManager
+ from .config.loader import load_config
+
+ __version__ = "2.0.0"
+ __all__ = ["PipelineManager", "load_config"]
src/config/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Configuration management for Audit QA."""
+
+ from .loader import load_config, get_nested_config
+
+ __all__ = ["load_config", "get_nested_config"]
src/config/collections.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "docling": {
+     "model": "BAAI/bge-m3",
+     "description": "Default collection with BGE-M3 embedding model"
+   },
+   "modernbert-embed-base-akryl-matryoshka": {
+     "model": "Akryl/modernbert-embed-base-akryl-matryoshka",
+     "description": "ModernBERT embedding model with matryoshka representation"
+   },
+   "sentence-transformers-all-MiniLM-L6-v2": {
+     "model": "sentence-transformers/all-MiniLM-L6-v2",
+     "description": "Sentence transformers MiniLM model"
+   },
+   "sentence-transformers-all-mpnet-base-v2": {
+     "model": "sentence-transformers/all-mpnet-base-v2",
+     "description": "Sentence transformers MPNet model"
+   },
+   "BAAI-bge-m3": {
+     "model": "BAAI/bge-m3",
+     "description": "BAAI BGE-M3 multilingual embedding model"
+   }
+ }
src/config/loader.py ADDED
@@ -0,0 +1,167 @@
+ """Configuration loader for YAML settings."""
+
+ import json
+ import os
+ import re
+ import yaml
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Load configuration from YAML file.
+
+     Args:
+         config_path: Path to config file. If None, uses default settings.yaml
+
+     Returns:
+         Dictionary containing configuration settings
+     """
+     if config_path is None:
+         # Default to settings.yaml in the same directory as this file
+         config_path = Path(__file__).parent / "settings.yaml"
+
+     config_path = Path(config_path)
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+     with open(config_path, 'r', encoding='utf-8') as f:
+         content = f.read()
+
+     def replace_env_vars(match):
+         env_var = match.group(1)
+         return os.getenv(env_var, match.group(0))  # Return original if env var not found
+
+     # Replace ${VAR} patterns with environment variables
+     content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content)
+
+     config = yaml.safe_load(content)
+
+     # Override with environment variables if they exist
+     config = _override_with_env_vars(config)
+
+     return config
+
+
+ def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]:
+     """Override config values with environment variables where available."""
+
+     # Map environment variables to config paths
+     env_mappings = {
+         'QDRANT_URL': ['qdrant', 'url'],
+         'QDRANT_COLLECTION': ['qdrant', 'collection_name'],
+         'QDRANT_API_KEY': ['qdrant', 'api_key'],
+         'RETRIEVER_MODEL': ['retriever', 'model'],
+         'RANKER_MODEL': ['ranker', 'model'],
+         'READER_TYPE': ['reader', 'default_type'],
+         'MAX_TOKENS': ['reader', 'max_tokens'],
+         'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'],
+         'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'],
+         'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'],
+         'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'],
+         'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'],
+         'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'],
+         'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'],
+     }
+
+     for env_var, config_path in env_mappings.items():
+         env_value = os.getenv(env_var)
+         if env_value:
+             # Navigate to the nested config location
+             current = config
+             for key in config_path[:-1]:
+                 if key not in current:
+                     current[key] = {}
+                 current = current[key]
+
+             # Set the final value, converting to appropriate type
+             final_key = config_path[-1]
+             if final_key in ['top_k', 'max_tokens', 'num_predict']:
+                 current[final_key] = int(env_value)
+             elif final_key in ['normalize', 'prefer_grpc']:
+                 current[final_key] = env_value.lower() in ('true', '1', 'yes')
+             elif final_key == 'temperature':
+                 current[final_key] = float(env_value)
+             else:
+                 current[final_key] = env_value
+
+     return config
+
+
+ def get_nested_config(config: Dict[str, Any], path: str, default=None):
+     """
+     Get a nested configuration value using dot notation.
+
+     Args:
+         config: Configuration dictionary
+         path: Dot-separated path (e.g., 'reader.MISTRAL.model')
+         default: Default value if path not found
+
+     Returns:
+         Configuration value or default
+     """
+     keys = path.split('.')
+     current = config
+
+     try:
+         for key in keys:
+             current = current[key]
+         return current
+     except (KeyError, TypeError):
+         return default
+
+
+ def load_collections_mapping() -> Dict[str, Dict[str, str]]:
+     """Load collections mapping from JSON file."""
+     collections_file = Path(__file__).parent / "collections.json"
+
+     if not collections_file.exists():
+         # Return default mapping if file doesn't exist
+         return {
+             "docling": {
+                 "model": "sentence-transformers/all-MiniLM-L6-v2",
+                 "description": "Default collection"
+             }
+         }
+
+     with open(collections_file, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+
+ def get_embedding_model_for_collection(collection_name: str) -> Optional[str]:
+     """Get embedding model for a specific collection name."""
+     collections = load_collections_mapping()
+
+     if collection_name in collections:
+         return collections[collection_name]["model"]
+
+     # Try to infer from collection name patterns
+     if "modernbert" in collection_name.lower():
+         return "Akryl/modernbert-embed-base-akryl-matryoshka"
+     elif "minilm" in collection_name.lower():
+         return "sentence-transformers/all-MiniLM-L6-v2"
+     elif "mpnet" in collection_name.lower():
+         return "sentence-transformers/all-mpnet-base-v2"
+     elif "bge" in collection_name.lower():
+         return "BAAI/bge-m3"
+
+     return None
+
+
+ def get_collection_info(collection_name: str) -> Dict[str, str]:
+     """Get full collection information including model and description."""
+     collections = load_collections_mapping()
+
+     if collection_name in collections:
+         return collections[collection_name]
+
+     # Return inferred info for unknown collections
+     model = get_embedding_model_for_collection(collection_name)
+     return {
+         "model": model or "unknown",
+         "description": f"Auto-inferred collection: {collection_name}"
+     }
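A minimal usage sketch for the loader above, run from the repo root (assuming the package directory is importable as `src`; the collection name passed here is hypothetical):

from src.config.loader import load_config, get_nested_config, get_embedding_model_for_collection

config = load_config()  # reads settings.yaml next to the module and expands ${VAR} patterns

# Dot-notation lookup with a fallback for missing paths
reader_model = get_nested_config(config, "reader.OPENAI.model", default="gpt-4o-mini")

# Collection-to-model resolution, exercising the name-pattern fallback
model = get_embedding_model_for_collection("my-bge-collection")  # -> "BAAI/bge-m3"
print(reader_model, model)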
src/config/settings.yaml ADDED
@@ -0,0 +1,92 @@
+ # Audit QA Configuration
+ # Converted from model_params.cfg to YAML format
+
+ qdrant:
+   # url: "http://10.1.4.192:8803"
+   url: "https://2c6d0136-b6ca-4400-bac5-1703f58abc43.europe-west3-0.gcp.cloud.qdrant.io"
+   collection_name: "docling"
+   prefer_grpc: true
+   api_key: "${QDRANT_API_KEY}"  # Load from environment variable
+
+ retriever:
+   model: "BAAI/bge-m3"
+   normalize: true
+   top_k: 20
+
+ retrieval:
+   use_reranking: true
+   reranker_model: "BAAI/bge-reranker-v2-m3"
+   reranker_top_k: 5
+
+ ranker:
+   model: "BAAI/bge-reranker-v2-m3"
+   top_k: 5
+
+ bm25:
+   top_k: 20
+
+ hybrid:
+   default_mode: "vector_only"  # Options: vector_only, sparse_only, hybrid
+   default_alpha: 0.5  # Weight for vector scores (0.5 = equal weight)
+
+ reader:
+   default_type: "OPENAI"
+   max_tokens: 768
+
+   # Different LLM provider configurations
+   INF_PROVIDERS:
+     model: "meta-llama/Llama-3.1-8B-Instruct"
+     provider: "nebius"
+
+   # Not working
+   NVIDIA:
+     model: "meta-llama/Llama-3.1-8B-Instruct"
+     endpoint: "https://huggingface.co/api/integrations/dgx/v1"
+
+   # Not working
+   DEDICATED:
+     model: "meta-llama/Llama-3.1-8B-Instruct"
+     endpoint: "https://qu2d8m6dmsollhly.us-east-1.aws.endpoints.huggingface.cloud"
+
+   MISTRAL:
+     model: "mistral-medium-latest"
+
+   OPENAI:
+     model: "gpt-4o-mini"
+
+   OLLAMA:
+     model: "mistral-small3.1:24b-instruct-2503-q8_0"
+     base_url: "http://10.1.4.192:11434/"
+     temperature: 0.8
+     num_predict: 256
+
+   OPENROUTER:
+     model: "moonshotai/kimi-k2:free"
+     base_url: "https://openrouter.ai/api/v1"
+     temperature: 0.7
+     max_tokens: 1000
+     # site_url: "https://your-site.com"  # optional, for OpenRouter ranking
+     # site_name: "Your Site Name"  # optional, for OpenRouter ranking
+
+ app:
+   dropdown_default: "Annual Consolidated OAG 2024"
+
+ # File paths
+ paths:
+   chunks_file: "reports/docling_chunks.json"
+   reports_dir: "reports"
+
+ # Feature toggles
+ features:
+   enable_session: true
+   enable_logging: true
+
+ # Logging and HuggingFace scheduler configuration
+ logging:
+   json_dataset_dir: "json_dataset"
+   huggingface:
+     repo_id: "GIZ/spaces_logs"
+     repo_type: "dataset"
+     folder_path: "json_dataset"
+     path_in_repo: "audit_chatbot"
+     token_env_var: "SPACES_LOG"
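A quick check of the override behavior wired up in loader.py: any variable in env_mappings set before load_config() takes precedence over the YAML value. A sketch, assuming no other overrides are set:

import os
from src.config.loader import load_config

os.environ["QDRANT_COLLECTION"] = "BAAI-bge-m3"  # mapped to ['qdrant', 'collection_name']
config = load_config()
assert config["qdrant"]["collection_name"] == "BAAI-bge-m3"  # YAML said "docling"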
src/llm/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """LLM adapters and utilities."""
+
+ from .adapters import LLMRegistry, get_llm_client
+ from .templates import get_message_template, PromptTemplate, create_audit_prompt
+
+ __all__ = ["LLMRegistry", "get_llm_client", "get_message_template", "PromptTemplate", "create_audit_prompt"]
src/llm/adapters.py ADDED
@@ -0,0 +1,409 @@
+ """LLM client adapters for different providers."""
+
+ from typing import Dict, Any, List, Optional, Union
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+
+ # LangChain imports
+ from langchain_mistralai.chat_models import ChatMistralAI
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain_ollama import ChatOllama
+
+ # Legacy client dependencies
+ from huggingface_hub import InferenceClient
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+ from langchain_community.llms import HuggingFaceEndpoint
+ from langchain_community.chat_models.huggingface import ChatHuggingFace
+
+ # Configuration loader
+ from ..config.loader import load_config
+
+ # Load configuration once at module level
+ _config = load_config()
+
+
+ # Legacy client factory functions (inlined from auditqa_old.reader)
+ def _create_inf_provider_client():
+     """Create INF_PROVIDERS client."""
+     reader_config = _config.get("reader", {})
+     inf_config = reader_config.get("INF_PROVIDERS", {})
+
+     api_key = inf_config.get("api_key")
+     if not api_key:
+         raise ValueError("INF_PROVIDERS api_key not found in configuration")
+
+     provider = inf_config.get("provider")
+     if not provider:
+         raise ValueError("INF_PROVIDERS provider not found in configuration")
+
+     return InferenceClient(
+         provider=provider,
+         api_key=api_key,
+         bill_to="GIZ",
+     )
+
+
+ def _create_nvidia_client():
+     """Create NVIDIA client."""
+     reader_config = _config.get("reader", {})
+     nvidia_config = reader_config.get("NVIDIA", {})
+
+     api_key = nvidia_config.get("api_key")
+     if not api_key:
+         raise ValueError("NVIDIA api_key not found in configuration")
+
+     endpoint = nvidia_config.get("endpoint")
+     if not endpoint:
+         raise ValueError("NVIDIA endpoint not found in configuration")
+
+     return InferenceClient(
+         base_url=endpoint,
+         api_key=api_key
+     )
+
+
+ def _create_serverless_client():
+     """Create serverless API client."""
+     reader_config = _config.get("reader", {})
+     serverless_config = reader_config.get("SERVERLESS", {})
+
+     api_key = serverless_config.get("api_key")
+     if not api_key:
+         raise ValueError("SERVERLESS api_key not found in configuration")
+
+     model_id = serverless_config.get("model", "meta-llama/Meta-Llama-3-8B-Instruct")
+
+     return InferenceClient(
+         model=model_id,
+         api_key=api_key,
+     )
+
+
+ def _create_dedicated_endpoint_client():
+     """Create dedicated endpoint client."""
+     reader_config = _config.get("reader", {})
+     dedicated_config = reader_config.get("DEDICATED", {})
+
+     api_key = dedicated_config.get("api_key")
+     if not api_key:
+         raise ValueError("DEDICATED api_key not found in configuration")
+
+     endpoint = dedicated_config.get("endpoint")
+     if not endpoint:
+         raise ValueError("DEDICATED endpoint not found in configuration")
+
+     max_tokens = dedicated_config.get("max_tokens", 768)
+
+     # Set up the streaming callback handler
+     callback = StreamingStdOutCallbackHandler()
+
+     # Initialize the HuggingFaceEndpoint with streaming enabled
+     llm_qa = HuggingFaceEndpoint(
+         endpoint_url=endpoint,
+         max_new_tokens=int(max_tokens),
+         repetition_penalty=1.03,
+         timeout=70,
+         huggingfacehub_api_token=api_key,
+         streaming=True,
+         callbacks=[callback]
+     )
+
+     # Create a ChatHuggingFace instance with the streaming-enabled endpoint
+     return ChatHuggingFace(llm=llm_qa)
+
+
+ @dataclass
+ class LLMResponse:
+     """Standardized LLM response format."""
+     content: str
+     model: str
+     provider: str
+     metadata: Optional[Dict[str, Any]] = None
+
+
+ class BaseLLMAdapter(ABC):
+     """Base class for LLM adapters."""
+
+     def __init__(self, config: Dict[str, Any]):
+         self.config = config
+
+     @abstractmethod
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response from messages."""
+         pass
+
+     @abstractmethod
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response from messages."""
+         pass
+
+
+ class MistralAdapter(BaseLLMAdapter):
+     """Adapter for Mistral AI models."""
+
+     def __init__(self, config: Dict[str, Any]):
+         super().__init__(config)
+         self.model = ChatMistralAI(
+             model=config.get("model", "mistral-medium-latest")
+         )
+
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response using Mistral."""
+         response = self.model.invoke(messages)
+
+         return LLMResponse(
+             content=response.content,
+             model=self.config.get("model", "mistral-medium-latest"),
+             provider="mistral",
+             metadata={"usage": getattr(response, 'usage_metadata', {})}
+         )
+
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response using Mistral."""
+         for chunk in self.model.stream(messages):
+             if chunk.content:
+                 yield chunk.content
+
+
+ class OpenAIAdapter(BaseLLMAdapter):
+     """Adapter for OpenAI models."""
+
+     def __init__(self, config: Dict[str, Any]):
+         super().__init__(config)
+         self.model = ChatOpenAI(
+             model=config.get("model", "gpt-4o-mini")
+         )
+
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response using OpenAI."""
+         response = self.model.invoke(messages)
+
+         return LLMResponse(
+             content=response.content,
+             model=self.config.get("model", "gpt-4o-mini"),
+             provider="openai",
+             metadata={"usage": getattr(response, 'usage_metadata', {})}
+         )
+
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response using OpenAI."""
+         for chunk in self.model.stream(messages):
+             if chunk.content:
+                 yield chunk.content
+
+
+ class OllamaAdapter(BaseLLMAdapter):
+     """Adapter for Ollama models."""
+
+     def __init__(self, config: Dict[str, Any]):
+         super().__init__(config)
+         self.model = ChatOllama(
+             model=config.get("model", "mistral-small3.1:24b-instruct-2503-q8_0"),
+             base_url=config.get("base_url", "http://localhost:11434/"),
+             temperature=config.get("temperature", 0.8),
+             num_predict=config.get("num_predict", 256)
+         )
+
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response using Ollama."""
+         response = self.model.invoke(messages)
+
+         return LLMResponse(
+             content=response.content,
+             model=self.config.get("model", "mistral-small3.1:24b-instruct-2503-q8_0"),
+             provider="ollama",
+             metadata={}
+         )
+
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response using Ollama."""
+         for chunk in self.model.stream(messages):
+             if chunk.content:
+                 yield chunk.content
+
+
+ class OpenRouterAdapter(BaseLLMAdapter):
+     """Adapter for OpenRouter models."""
+
+     def __init__(self, config: Dict[str, Any]):
+         super().__init__(config)
+
+         # Prepare custom headers for OpenRouter (optional)
+         headers = {}
+         if config.get("site_url"):
+             headers["HTTP-Referer"] = config["site_url"]
+         if config.get("site_name"):
+             headers["X-Title"] = config["site_name"]
+
+         # Initialize ChatOpenAI with OpenRouter configuration
+         self.model = ChatOpenAI(
+             model=config.get("model", "openai/gpt-3.5-turbo"),
+             api_key=config.get("api_key"),
+             base_url=config.get("base_url", "https://openrouter.ai/api/v1"),
+             default_headers=headers if headers else {},
+             temperature=config.get("temperature", 0.7),
+             max_tokens=config.get("max_tokens", 1000)
+         )
+
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response using OpenRouter."""
+         response = self.model.invoke(messages)
+
+         return LLMResponse(
+             content=response.content,
+             model=self.config.get("model", "openai/gpt-3.5-turbo"),
+             provider="openrouter",
+             metadata={"usage": getattr(response, 'usage_metadata', {})}
+         )
+
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response using OpenRouter."""
+         for chunk in self.model.stream(messages):
+             if chunk.content:
+                 yield chunk.content
+
+
+ class LegacyAdapter(BaseLLMAdapter):
+     """Adapter for legacy LLM clients (INF_PROVIDERS, NVIDIA, etc.)."""
+
+     def __init__(self, config: Dict[str, Any], client_type: str):
+         super().__init__(config)
+         self.client_type = client_type
+         self.client = self._create_client()
+
+     def _create_client(self):
+         """Create legacy client based on type."""
+         if self.client_type == "INF_PROVIDERS":
+             return _create_inf_provider_client()
+         elif self.client_type == "NVIDIA":
+             return _create_nvidia_client()
+         elif self.client_type == "DEDICATED":
+             return _create_dedicated_endpoint_client()
+         else:  # SERVERLESS
+             return _create_serverless_client()
+
+     def generate(self, messages: List[Dict[str, str]], **kwargs) -> LLMResponse:
+         """Generate response using legacy client."""
+         max_tokens = kwargs.get('max_tokens', self.config.get('max_tokens', 768))
+
+         if self.client_type == "INF_PROVIDERS":
+             response = self.client.chat.completions.create(
+                 model=self.config.get("model"),
+                 messages=messages,
+                 max_tokens=max_tokens
+             )
+             content = response.choices[0].message.content
+
+         elif self.client_type == "NVIDIA":
+             response = self.client.chat_completion(
+                 model=self.config.get("model"),
+                 messages=messages,
+                 max_tokens=max_tokens
+             )
+             content = response.choices[0].message.content
+
+         else:  # DEDICATED or SERVERLESS
+             response = self.client.chat_completion(
+                 messages=messages,
+                 max_tokens=max_tokens
+             )
+             content = response.choices[0].message.content
+
+         return LLMResponse(
+             content=content,
+             model=self.config.get("model", "unknown"),
+             provider=self.client_type.lower(),
+             metadata={}
+         )
+
+     def stream_generate(self, messages: List[Dict[str, str]], **kwargs):
+         """Generate streaming response using legacy client."""
+         # Legacy clients may not support streaming in the same way
+         # This is a simplified implementation
+         response = self.generate(messages, **kwargs)
+         words = response.content.split()
+         for word in words:
+             yield word + " "
+
+
+ class LLMRegistry:
+     """Registry for managing different LLM adapters."""
+
+     def __init__(self):
+         self.adapters = {}
+         self.adapter_configs = {}
+
+     def register_adapter(self, name: str, adapter_class: type, config: Dict[str, Any]):
+         """Register an LLM adapter (lazy instantiation)."""
+         self.adapter_configs[name] = (adapter_class, config)
+
+     def get_adapter(self, name: str) -> BaseLLMAdapter:
+         """Get an LLM adapter by name (lazy instantiation)."""
+         if name not in self.adapter_configs:
+             raise ValueError(f"Unknown LLM adapter: {name}")
+
+         # Lazy instantiation - only create when needed
+         if name not in self.adapters:
+             adapter_class, config = self.adapter_configs[name]
+             self.adapters[name] = adapter_class(config)
+
+         return self.adapters[name]
+
+     def list_adapters(self) -> List[str]:
+         """List available adapter names."""
+         return list(self.adapter_configs.keys())
+
+
+ def create_llm_registry(config: Dict[str, Any]) -> LLMRegistry:
+     """
+     Create and populate LLM registry from configuration.
+
+     Args:
+         config: Configuration dictionary
+
+     Returns:
+         Populated LLMRegistry
+     """
+     registry = LLMRegistry()
+     reader_config = config.get("reader", {})
+
+     # Register simple adapters
+     if "MISTRAL" in reader_config:
+         registry.register_adapter("mistral", MistralAdapter, reader_config["MISTRAL"])
+
+     if "OPENAI" in reader_config:
+         registry.register_adapter("openai", OpenAIAdapter, reader_config["OPENAI"])
+
+     if "OLLAMA" in reader_config:
+         registry.register_adapter("ollama", OllamaAdapter, reader_config["OLLAMA"])
+
+     if "OPENROUTER" in reader_config:
+         registry.register_adapter("openrouter", OpenRouterAdapter, reader_config["OPENROUTER"])
+
+     # Register legacy adapters
+     # legacy_types = ["INF_PROVIDERS", "NVIDIA", "DEDICATED"]
+     legacy_types = ["INF_PROVIDERS"]
+     for legacy_type in legacy_types:
+         if legacy_type in reader_config:
+             registry.register_adapter(
+                 legacy_type.lower(),
+                 lambda cfg, lt=legacy_type: LegacyAdapter(cfg, lt),
+                 reader_config[legacy_type]
+             )
+
+     return registry
+
+
+ def get_llm_client(provider: str, config: Dict[str, Any]) -> BaseLLMAdapter:
+     """
+     Get LLM client for specified provider.
+
+     Args:
+         provider: Provider name (mistral, openai, ollama, etc.)
+         config: Configuration dictionary
+
+     Returns:
+         LLM adapter instance
+     """
+     registry = create_llm_registry(config)
+     return registry.get_adapter(provider)
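A minimal sketch of the registry API above, assuming OPENAI_API_KEY is set in the environment so ChatOpenAI can authenticate:

from src.config.loader import load_config
from src.llm.adapters import create_llm_registry

config = load_config()
registry = create_llm_registry(config)
print(registry.list_adapters())  # e.g. ['mistral', 'openai', 'ollama', 'openrouter', 'inf_providers']

adapter = registry.get_adapter("openai")  # instantiated lazily on first access
response = adapter.generate([{"role": "user", "content": "Ping"}])
print(response.provider, response.content)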
src/llm/templates.py ADDED
@@ -0,0 +1,232 @@
+ """LLM prompt templates and message formatting utilities."""
+
+ from typing import List, Dict, Any, Union
+ from dataclasses import dataclass
+ from langchain.schema import SystemMessage, HumanMessage
+
+
+ @dataclass
+ class PromptTemplate:
+     """Template for managing prompts with variables."""
+
+     system_prompt: str
+     user_prompt_template: str
+
+     def format(self, **kwargs) -> tuple:
+         """Format the template with provided variables."""
+         formatted_user = self.user_prompt_template.format(**kwargs)
+         return self.system_prompt, formatted_user
+
+
+ # Default system prompt for audit Q&A
+ DEFAULT_AUDIT_SYSTEM_PROMPT = """
+ You are AuditQ&A, an AI Assistant for audit reports. Answer questions directly and factually based on the provided context.
+
+ Guidelines:
+ - Answer directly and concisely (2-3 sentences maximum)
+ - Use specific facts and numbers from the context
+ - Cite sources using [Doc i] format
+ - Be factual, not opinionated
+ - Avoid phrases like "From my point of view", "I think", "It seems"
+
+ Examples:
+
+ Query: "What challenges arise from contradictory PDM implementation guidelines?"
+ Context: [Retrieved documents about PDM guidelines contradictions]
+ Answer: "Contradictory PDM implementation guidelines cause challenges during implementation, as entities receive numerous and often conflicting directives from different authorities. For example, guidelines on transfer of funds to PDM SACCOs differ between the PDM Secretariat and PSST, and there are conflicting directives on fund diversion from various authorities."
+
+ Query: "What was the supplementary funding obtained for the wage budget?"
+ Context: [Retrieved documents about wage budget funding]
+ Answer: "The supplementary funding obtained for the wage budget was UGX.2,208,040,656."
+
+ Now answer the following question based on the provided context:
+ """
+
+ # Default user prompt template
+ DEFAULT_USER_PROMPT_TEMPLATE = """Passages:
+ {context}
+ -----------------------
+ Question: {question} - Explained to audit expert
+ Answer in English with the passage citations:
+ """
+
+
+ def create_audit_prompt(context_list: List[str], query: str) -> List[Dict[str, str]]:
+     """
+     Create audit Q&A prompt messages from context and query.
+
+     Args:
+         context_list: List of context passages
+         query: User query
+
+     Returns:
+         List of message dictionaries for LLM
+     """
+     # Join context passages with numbering
+     numbered_context = []
+     for i, passage in enumerate(context_list, 1):
+         numbered_context.append(f"Doc {i}: {passage}")
+
+     context_str = "\n\n".join(numbered_context)
+
+     # Format user prompt
+     user_prompt = DEFAULT_USER_PROMPT_TEMPLATE.format(
+         context=context_str,
+         question=query
+     )
+
+     # Return as message format
+     messages = [
+         {"role": "system", "content": DEFAULT_AUDIT_SYSTEM_PROMPT},
+         {"role": "user", "content": user_prompt}
+     ]
+
+     return messages
+
+
+ def get_message_template(
+     provider_type: str,
+     system_prompt: str,
+     user_prompt: str
+ ) -> List[Union[Dict[str, str], SystemMessage, HumanMessage]]:
+     """
+     Get message template based on LLM provider type.
+
+     Args:
+         provider_type: Type of LLM provider
+         system_prompt: System prompt content
+         user_prompt: User prompt content
+
+     Returns:
+         List of messages in the appropriate format for the provider
+     """
+     provider_type = provider_type.upper()
+
+     if provider_type in ['NVIDIA', 'INF_PROVIDERS', 'MISTRAL', 'OPENAI', 'OPENROUTER']:
+         # Dictionary format for API-based providers
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt}
+         ]
+     elif provider_type in ['DEDICATED', 'SERVERLESS', 'OLLAMA']:
+         # LangChain message objects for local/dedicated providers
+         messages = [
+             SystemMessage(content=system_prompt),
+             HumanMessage(content=user_prompt)
+         ]
+     else:
+         # Default to dictionary format
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt}
+         ]
+
+     return messages
+
+
+ def create_custom_prompt_template(
+     system_prompt: str,
+     user_template: str
+ ) -> PromptTemplate:
+     """
+     Create a custom prompt template.
+
+     Args:
+         system_prompt: System prompt content
+         user_template: User prompt template with placeholders
+
+     Returns:
+         PromptTemplate instance
+     """
+     return PromptTemplate(
+         system_prompt=system_prompt,
+         user_prompt_template=user_template
+     )
+
+
+ def create_evaluation_prompt(context_list: List[str], query: str, expected_answer: str) -> List[Dict[str, str]]:
+     """
+     Create prompt for evaluation purposes with expected answer.
+
+     Args:
+         context_list: List of context passages
+         query: User query
+         expected_answer: Expected/ground truth answer
+
+     Returns:
+         List of message dictionaries for evaluation
+     """
+     # Join context passages
+     context_str = "\n\n".join([f"Doc {i}: {passage}" for i, passage in enumerate(context_list, 1)])
+
+     evaluation_system_prompt = """
+ You are an evaluation assistant. Given context passages, a question, and an expected answer,
+ evaluate how well the provided context supports answering the question accurately.
+
+ Provide your evaluation focusing on:
+ 1. Relevance of the context to the question
+ 2. Completeness of information needed to answer
+ 3. Quality and accuracy of supporting details
+ """
+
+     user_prompt = f"""Context Passages:
+ {context_str}
+
+ Question: {query}
+ Expected Answer: {expected_answer}
+
+ Evaluation:"""
+
+     return [
+         {"role": "system", "content": evaluation_system_prompt},
+         {"role": "user", "content": user_prompt}
+     ]
+
+
+ def get_prompt_variants() -> Dict[str, PromptTemplate]:
+     """
+     Get different prompt template variants for testing.
+
+     Returns:
+         Dictionary of named prompt templates
+     """
+     variants = {
+         "standard": create_custom_prompt_template(
+             DEFAULT_AUDIT_SYSTEM_PROMPT,
+             DEFAULT_USER_PROMPT_TEMPLATE
+         ),
+
+         "concise": create_custom_prompt_template(
+             """You are an audit report AI assistant. Provide clear, concise answers based on the given context passages. Always cite sources using [Doc i] format.""",
+             """Context:\n{context}\n\nQuestion: {question}\nAnswer:"""
+         ),
+
+         "detailed": create_custom_prompt_template(
+             DEFAULT_AUDIT_SYSTEM_PROMPT + """\n\nAdditional Instructions:
+ - Provide detailed explanations with specific examples
+ - Include relevant numbers, dates, and financial figures when available
+ - Structure your response with clear headings when appropriate
+ - Explain the significance of findings in the context of governance and accountability""",
+             DEFAULT_USER_PROMPT_TEMPLATE
+         )
+     }
+
+     return variants
+
+
+ # Backward compatibility function
+ def format_context_with_citations(context_list: List[str]) -> str:
+     """
+     Format context list with document citations.
+
+     Args:
+         context_list: List of context passages
+
+     Returns:
+         Formatted context string with citations
+     """
+     formatted_passages = []
+     for i, passage in enumerate(context_list, 1):
+         formatted_passages.append(f"Doc {i}: {passage}")
+
+     return "\n\n".join(formatted_passages)
src/loader.py ADDED
@@ -0,0 +1,115 @@
+ """Data loading utilities for chunks and JSON files."""
+
+ import json
+ from pathlib import Path
+ from typing import List, Dict, Any
+ from langchain.docstore.document import Document
+
+
+ def load_json(filepath: Path | str) -> List[Dict[str, Any]]:
+     """
+     Load JSON data from file.
+
+     Args:
+         filepath: Path to JSON file
+
+     Returns:
+         List of dictionaries containing the JSON data
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"JSON file not found: {filepath}")
+
+     with open(filepath, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     return data
+
+
+ def open_file(filepath: Path | str) -> str:
+     """
+     Open and read a text file.
+
+     Args:
+         filepath: Path to text file
+
+     Returns:
+         File contents as string
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     with open(filepath, 'r', encoding='utf-8') as f:
+         content = f.read()
+
+     return content
+
+
+ def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
+     """
+     Load document chunks from JSON file.
+
+     Args:
+         chunks_file: Path to chunks JSON file. If None, uses default path.
+
+     Returns:
+         List of chunk dictionaries
+     """
+     if chunks_file is None:
+         chunks_file = Path("reports/docling_chunks.json")
+
+     return load_json(chunks_file)
+
+
+ def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
+     """
+     Convert chunk dictionaries to LangChain Document objects.
+
+     Args:
+         chunks: List of chunk dictionaries
+
+     Returns:
+         List of Document objects
+     """
+     documents = []
+
+     for chunk in chunks:
+         doc = Document(
+             page_content=chunk.get("content", ""),
+             metadata=chunk.get("metadata", {})
+         )
+         documents.append(doc)
+
+     return documents
+
+
+ def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
+     """
+     Validate that chunks have required fields.
+
+     Args:
+         chunks: List of chunk dictionaries
+
+     Returns:
+         True if valid, raises ValueError if invalid
+     """
+     required_fields = ["content", "metadata"]
+
+     for i, chunk in enumerate(chunks):
+         for field in required_fields:
+             if field not in chunk:
+                 raise ValueError(f"Chunk {i} missing required field: {field}")
+
+         # Validate metadata has required fields
+         metadata = chunk["metadata"]
+         if not isinstance(metadata, dict):
+             raise ValueError(f"Chunk {i} metadata must be a dictionary")
+
+         # Check for common metadata fields
+         if "filename" not in metadata:
+             raise ValueError(f"Chunk {i} metadata missing 'filename' field")
+
+     return True
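A sketch of the expected chunk shape and the load/validate/convert flow, assuming reports/docling_chunks.json exists relative to the working directory:

from src.loader import load_chunks, validate_chunks, chunks_to_documents

# Expected shape: [{"content": "...", "metadata": {"filename": "...", ...}}, ...]
chunks = load_chunks()  # defaults to reports/docling_chunks.json
validate_chunks(chunks)  # raises ValueError on missing fields
documents = chunks_to_documents(chunks)
print(len(documents), documents[0].metadata.get("filename"))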
src/logging.py ADDED
@@ -0,0 +1,194 @@
+ """Logging utilities for the Audit QA pipeline."""
+ import json
+ import logging
+ import os
+ from uuid import uuid4
+ from pathlib import Path
+ from threading import Lock
+ from datetime import datetime
+ from typing import Dict, Any, Optional
+
+ from .config import load_config
+
+ def save_logs(
+     scheduler=None,
+     json_dataset_path: Optional[Path] = None,
+     logs_data: Optional[Dict[str, Any]] = None,
+     feedback: Optional[str] = None
+ ) -> None:
+     """
+     Append a structured log record to the JSONL dataset.
+
+     Args:
+         scheduler: Optional HuggingFace scheduler; its lock is reused when available
+         json_dataset_path: Path to the JSONL dataset file
+         logs_data: Log data dictionary
+         feedback: Optional user feedback
+
+     Note:
+         Kept signature-compatible with the legacy implementation.
+         Records are appended as one JSON object per line.
+     """
+     if not is_logging_enabled():
+         return
+     try:
+         current_time = datetime.now().timestamp()
+         logs_data["time"] = str(current_time)
+         if feedback:
+             logs_data["feedback"] = feedback
+         logs_data["record_id"] = str(uuid4())
+         field_order = [
+             "record_id",
+             "session_id",
+             "time",
+             "session_duration_seconds",
+             "client_location",
+             "platform",
+             "system_prompt",
+             "sources",
+             "reports",
+             "subtype",
+             "year",
+             "question",
+             "retriever",
+             "endpoint_type",
+             "reader",
+             "docs",
+             "answer",
+             "feedback"
+         ]
+         ordered_logs = {k: logs_data.get(k) for k in field_order if k in logs_data}
+         lock = getattr(scheduler, "lock", None)
+         if lock is None:
+             lock = Lock()
+         with lock:
+             with open(json_dataset_path, 'a') as f:
+                 json.dump(ordered_logs, f)
+                 f.write("\n")
+         logging.info("logging done")
+     except Exception as e:
+         logging.error(f"Error saving logs: {e}")
+         raise
+
+
+ def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None) -> None:
+     """
+     Set up logging configuration.
+
+     Args:
+         log_level: Logging level
+         log_file: Optional log file path
+     """
+     if not is_logging_enabled():
+         return
+
+     # Configure logging
+     logging.basicConfig(
+         level=getattr(logging, log_level.upper()),
+         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+         handlers=[
+             logging.StreamHandler(),
+             logging.FileHandler(log_file) if log_file else logging.NullHandler()
+         ]
+     )
+
+
+ def log_query_response(
+     query: str,
+     response: str,
+     metadata: Optional[Dict[str, Any]] = None
+ ) -> None:
+     """
+     Log query and response for analysis.
+
+     Args:
+         query: User query
+         response: System response
+         metadata: Additional metadata
+     """
+     if not is_logging_enabled():
+         return
+
+     logger = logging.getLogger(__name__)
+
+     log_entry = {
+         "query": query,
+         "response_length": len(response),
+         "metadata": metadata or {}
+     }
+
+     logger.info(f"Query processed: {log_entry}")
+
+
+ def log_error(error: Exception, context: Optional[Dict[str, Any]] = None) -> None:
+     """
+     Log error with context.
+
+     Args:
+         error: Exception that occurred
+         context: Additional context information
+     """
+     if not is_logging_enabled():
+         return
+
+     logger = logging.getLogger(__name__)
+
+     error_info = {
+         "error_type": type(error).__name__,
+         "error_message": str(error),
+         "context": context or {}
+     }
+
+     logger.error(f"Error occurred: {error_info}")
+
+
+ def log_performance_metrics(
+     operation: str,
+     duration: float,
+     metadata: Optional[Dict[str, Any]] = None
+ ) -> None:
+     """
+     Log performance metrics.
+
+     Args:
+         operation: Name of the operation
+         duration: Duration in seconds
+         metadata: Additional metadata
+     """
+     if not is_logging_enabled():
+         return
+
+     logger = logging.getLogger(__name__)
+
+     metrics = {
+         "operation": operation,
+         "duration_seconds": duration,
+         "metadata": metadata or {}
+     }
+
+     logger.info(f"Performance metrics: {metrics}")
+
+
+ def is_session_enabled() -> bool:
+     """
+     Returns True if session management is enabled, False otherwise.
+     Checks environment variable ENABLE_SESSION first, then config.
+     """
+     env = os.getenv("ENABLE_SESSION")
+     if env is not None:
+         return env.lower() in ("1", "true", "yes", "on")
+     config = load_config()
+     return config.get("features", {}).get("enable_session", True)
+
+
+ def is_logging_enabled() -> bool:
+     """
+     Returns True if logging is enabled, False otherwise.
+     Checks environment variable ENABLE_LOGGING first, then config.
+     """
+     env = os.getenv("ENABLE_LOGGING")
+     if env is not None:
+         return env.lower() in ("1", "true", "yes", "on")
+     config = load_config()
+     return config.get("features", {}).get("enable_logging", True)
+
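A sketch of the logging helpers, assuming ENABLE_LOGGING is unset so the config default (enable_logging: true) applies:

import time
from src.logging import setup_logging, log_performance_metrics

setup_logging("INFO")
start = time.time()
# ... run a retrieval or generation step here ...
log_performance_metrics("retrieval", time.time() - start, {"top_k": 20})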
src/pipeline.py ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main pipeline orchestrator for the Audit QA system."""
2
+ import time
3
+ from pathlib import Path
4
+ from dataclasses import dataclass
5
+ from typing import Dict, Any, List, Optional
6
+
7
+ from langchain.docstore.document import Document
8
+
9
+ from .logging import log_error
10
+ from .llm.adapters import LLMRegistry
11
+ from .loader import chunks_to_documents
12
+ from .vectorstore import VectorStoreManager
13
+ from .retrieval.context import ContextRetriever
14
+ from .config.loader import get_embedding_model_for_collection
15
+
16
+
17
+
18
+ @dataclass
19
+ class PipelineResult:
20
+ """Result of pipeline execution."""
21
+ answer: str
22
+ sources: List[Document]
23
+ execution_time: float
24
+ metadata: Dict[str, Any]
25
+ query: str = "" # Add default value for query
26
+
27
+ def __post_init__(self):
28
+ """Post-initialization processing."""
29
+ if not self.query:
30
+ self.query = "Unknown query"
31
+
32
+
33
+ class PipelineManager:
34
+ """Main pipeline manager for the RAG system."""
35
+
36
+ def __init__(self, config: dict = None):
37
+ """
38
+ Initialize the pipeline manager.
39
+ """
40
+ self.config = config or {}
41
+ self.vectorstore_manager = None
42
+ self.context_retriever = None # Initialize as None
43
+ self.llm_client = None
44
+ self.report_service = None
45
+ self.chunks = None
46
+
47
+ # Initialize components
48
+ self._initialize_components()
49
+
50
+ def update_config(self, new_config: dict):
51
+ """
52
+ Update the pipeline configuration.
53
+ This is useful for experiments that need different settings.
54
+ """
55
+ if not isinstance(new_config, dict):
56
+ return
57
+
58
+ # Deep merge the new config with existing config
59
+ def deep_merge(base_dict, update_dict):
60
+ for key, value in update_dict.items():
61
+ if key in base_dict and isinstance(base_dict[key], dict) and isinstance(value, dict):
62
+ deep_merge(base_dict[key], value)
63
+ else:
64
+ base_dict[key] = value
65
+
66
+ deep_merge(self.config, new_config)
67
+
68
+ # Auto-infer embedding model from collection name if not "docling"
69
+ collection_name = self.config.get('qdrant', {}).get('collection_name', 'docling')
70
+ if collection_name != 'docling':
71
+ inferred_model = get_embedding_model_for_collection(collection_name)
72
+ if inferred_model:
73
+ print(f"🔍 Auto-inferred embedding model for collection '{collection_name}': {inferred_model}")
74
+ if 'retriever' not in self.config:
75
+ self.config['retriever'] = {}
76
+ self.config['retriever']['model'] = inferred_model
77
+ # Set default normalize parameter if not present
78
+ if 'normalize' not in self.config['retriever']:
79
+ self.config['retriever']['normalize'] = True
80
+
81
+ # Also update vectorstore config if it exists
82
+ if 'vectorstore' in self.config:
83
+ self.config['vectorstore']['embedding_model'] = inferred_model
84
+
85
+ print(f"🔧 CONFIG UPDATED: Pipeline config updated with experiment settings")
86
+
87
+ # Re-initialize vectorstore manager with updated config
88
+ self._reinitialize_vectorstore_manager()
89
+
90
+ def _reinitialize_vectorstore_manager(self):
91
+ """Re-initialize vectorstore manager with current config."""
92
+ try:
93
+ self.vectorstore_manager = VectorStoreManager(self.config)
94
+ print("🔄 VectorStore manager re-initialized with updated config")
95
+ except Exception as e:
96
+ print(f"❌ Error re-initializing vectorstore manager: {e}")
97
+
98
+ def _get_reranker_model_name(self) -> str:
99
+ """
100
+ Get the reranker model name from configuration.
101
+
102
+ Returns:
103
+ Reranker model name or default
104
+ """
105
+ return (
106
+ self.config.get('retrieval', {}).get('reranker_model') or
107
+ self.config.get('ranker', {}).get('model') or
108
+ self.config.get('reranker_model') or
109
+ 'BAAI/bge-reranker-v2-m3'
110
+ )
111
+
112
+ def _initialize_components(self):
113
+ """Initialize pipeline components."""
114
+ try:
115
+ # Load config if not provided
116
+ if not self.config:
117
+ from auditqa.config.loader import load_config
118
+ self.config = load_config()
119
+
120
+ # Auto-infer embedding model from collection name if not "docling"
121
+ collection_name = self.config.get('qdrant', {}).get('collection_name', 'docling')
122
+ if collection_name != 'docling':
123
+ inferred_model = get_embedding_model_for_collection(collection_name)
124
+ if inferred_model:
125
+ print(f"🔍 Auto-inferred embedding model for collection '{collection_name}': {inferred_model}")
126
+ if 'retriever' not in self.config:
127
+ self.config['retriever'] = {}
128
+ self.config['retriever']['model'] = inferred_model
129
+ # Set default normalize parameter if not present
130
+ if 'normalize' not in self.config['retriever']:
131
+ self.config['retriever']['normalize'] = True
132
+
133
+ # Also update vectorstore config if it exists
134
+ if 'vectorstore' in self.config:
135
+ self.config['vectorstore']['embedding_model'] = inferred_model
136
+
137
+ self.vectorstore_manager = VectorStoreManager(self.config)
138
+
139
+ self.llm_manager = LLMRegistry()
140
+
141
+ # Try to get LLM client using the correct method
142
+ self.llm_client = None
143
+ try:
144
+ # Try using get_adapter method (most likely correct)
145
+ self.llm_client = self.llm_manager.get_adapter("openai")
146
+ print("✅ LLM CLIENT: Initialized using get_adapter method")
147
+ except Exception as e:
148
+ try:
149
+ # Try direct instantiation with config
150
+ from auditqa.llm.adapters import get_llm_client
151
+ self.llm_client = get_llm_client("openai", self.config)
152
+ print("✅ LLM CLIENT: Initialized using direct get_llm_client function with config")
153
+ except Exception as e2:
154
+ print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
155
+ # Try to create a simple LLM client directly
156
+ try:
157
+ from langchain_openai import ChatOpenAI
158
+ import os
159
+ api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
160
+ if api_key:
161
+ self.llm_client = ChatOpenAI(
162
+ model="gpt-3.5-turbo",
163
+ api_key=api_key,
164
+ temperature=0.1,
165
+ max_tokens=1000
166
+ )
167
+ print("✅ LLM CLIENT: Initialized using direct ChatOpenAI")
168
+ else:
169
+ print("❌ LLM CLIENT: No API key available")
170
+ except Exception as e3:
171
+ print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
172
+ self.llm_client = None
173
+
174
+ # Load system prompt
175
+ from auditqa.llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
176
+ self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
177
+
178
+ # Initialize report service
179
+ try:
180
+ from auditqa.reporting.service import ReportService
181
+ self.report_service = ReportService()
182
+ except Exception as e:
183
+ print(f"Warning: Could not initialize report service: {e}")
184
+ self.report_service = None
185
+
186
+ except Exception as e:
187
+ print(f"Warning: Error initializing components: {e}")
188
+
189
+ def test_retrieval(
190
+ self,
191
+ query: str,
192
+ reports: List[str] = None,
193
+ sources: str = None,
194
+ subtype: List[str] = None,
195
+ k: int = None,
196
+ search_mode: str = None,
197
+ search_alpha: float = None,
198
+ use_reranking: bool = True
199
+ ) -> Dict[str, Any]:
200
+ """
201
+ Test retrieval only without LLM inference.
202
+
203
+ Args:
204
+ query: User query
205
+ reports: List of specific report filenames
206
+ sources: Source category
207
+ subtype: List of subtypes
208
+ k: Number of documents to retrieve
209
+ search_mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
210
+ search_alpha: Weight for vector scores in hybrid mode
211
+ use_reranking: Whether to use reranking
212
+
213
+ Returns:
214
+ Dictionary with retrieval results and metadata
215
+ """
216
+ start_time = time.time()
217
+
218
+ try:
219
+ # Set default search parameters if not provided
220
+ if search_mode is None:
221
+ search_mode = self.config.get("hybrid", {}).get("default_mode", "vector_only")
222
+ if search_alpha is None:
223
+ search_alpha = self.config.get("hybrid", {}).get("default_alpha", 0.5)
224
+
225
+ # Get vector store
226
+ vectorstore = self.vectorstore_manager.get_vectorstore()
227
+ if not vectorstore:
228
+ raise ValueError(
229
+ "Vector store not available. Call connect_vectorstore() or create_vectorstore() first."
230
+ )
231
+
232
+ # Retrieve context with scores for test retrieval
233
+ context_docs_with_scores = self.context_retriever.retrieve_with_scores(
234
+ vectorstore=vectorstore,
235
+ query=query,
236
+ reports=reports,
237
+ sources=sources,
238
+ subtype=subtype,
239
+ k=k,
240
+ search_mode=search_mode,
241
+ alpha=search_alpha,
242
+ )
243
+
244
+ # Extract documents and scores
245
+ context_docs = [doc for doc, score in context_docs_with_scores]
246
+ context_scores = [score for doc, score in context_docs_with_scores]
247
+
248
+ execution_time = time.time() - start_time
249
+
250
+ # Format results with actual scores
251
+ results = []
252
+ for i, (doc, score) in enumerate(zip(context_docs, context_scores)):
253
+ results.append({
254
+ "rank": i + 1,
255
+ "content": doc.page_content, # Return full content without truncation
256
+ "metadata": doc.metadata,
257
+ "score": score if score is not None else 0.0
258
+ })
259
+
260
+ return {
261
+ "results": results,
262
+ "num_results": len(results),
263
+ "execution_time": execution_time,
264
+ "search_mode": search_mode,
265
+ "search_alpha": search_alpha,
266
+ "query": query
267
+ }
268
+
269
+ except Exception as e:
270
+ print(f"❌ Error during retrieval test: {e}")
271
+ log_error(e, {"component": "retrieval_test", "query": query})
272
+ return {
273
+ "results": [],
274
+ "num_results": 0,
275
+ "execution_time": time.time() - start_time,
276
+ "error": str(e),
277
+ "search_mode": search_mode or "unknown",
278
+ "search_alpha": search_alpha or 0.5,
279
+ "query": query
280
+ }
281
+
282
+ def connect_vectorstore(self, force_recreate: bool = False) -> bool:
283
+ """
284
+ Connect to existing vector store.
285
+
286
+ Args:
287
+ force_recreate: If True, recreate the collection if dimension mismatch occurs
288
+
289
+ Returns:
290
+ True if successful, False otherwise
291
+ """
292
+ try:
293
+ vectorstore = self.vectorstore_manager.connect_to_existing(force_recreate=force_recreate)
294
+ if vectorstore:
295
+ print("✅ Connected to vector store")
296
+ return True
297
+ else:
298
+ print("❌ Failed to connect to vector store")
299
+ return False
300
+ except Exception as e:
301
+ print(f"❌ Error connecting to vector store: {e}")
302
+ log_error(e, {"component": "vectorstore_connection"})
303
+
304
+ # If it's a dimension mismatch error, try with force_recreate
305
+ if "dimensions" in str(e).lower() and not force_recreate:
306
+ print("🔄 Dimension mismatch detected, attempting to recreate collection...")
307
+ try:
308
+ vectorstore = self.vectorstore_manager.connect_to_existing(force_recreate=True)
309
+ if vectorstore:
310
+ print("✅ Connected to vector store (recreated)")
311
+ return True
312
+ except Exception as recreate_error:
313
+ print(f"❌ Failed to recreate vector store: {recreate_error}")
314
+ log_error(recreate_error, {"component": "vectorstore_recreation"})
315
+
316
+ return False
317
+
318
+ def create_vectorstore(self) -> bool:
319
+ """
320
+ Create new vector store from chunks.
321
+
322
+ Returns:
323
+ True if successful, False otherwise
324
+ """
325
+ try:
326
+ if not self.chunks:
327
+ raise ValueError("No chunks available for vector store creation")
328
+
329
+ documents = chunks_to_documents(self.chunks)
330
+ self.vectorstore_manager.create_from_documents(documents)
331
+ print("✅ Vector store created successfully")
332
+ return True
333
+ except Exception as e:
334
+ print(f"❌ Error creating vector store: {e}")
335
+ log_error(e, {"component": "vectorstore_creation"})
336
+ return False
337
+
338
+ def create_audit_prompt(self, query: str, context_docs: List[Document]) -> str:
339
+ """Create a prompt for the LLM to generate an answer."""
340
+ try:
341
+ # Ensure query is not None
342
+ if not query or not isinstance(query, str) or query.strip() == "":
343
+ return "Error: No query provided"
344
+
345
+ # Ensure context_docs is not None and is a list
346
+ if context_docs is None:
347
+ context_docs = []
348
+
349
+ # Filter out None documents and ensure they have content
350
+ valid_docs = []
351
+ for doc in context_docs:
352
+ if doc is not None:
353
+ if hasattr(doc, 'page_content') and doc.page_content and isinstance(doc.page_content, str):
354
+ valid_docs.append(doc)
355
+ elif isinstance(doc, str) and doc.strip():
356
+ valid_docs.append(doc)
357
+
358
+ # Create context string
359
+ if valid_docs:
360
+ context_parts = []
361
+ for i, doc in enumerate(valid_docs, 1):
362
+ if hasattr(doc, 'page_content') and doc.page_content:
363
+ context_parts.append(f"Doc {i}: {doc.page_content}")
364
+ elif isinstance(doc, str) and doc.strip():
365
+ context_parts.append(f"Doc {i}: {doc}")
366
+
367
+ context_string = "\n\n".join(context_parts)
368
+ else:
369
+ context_string = "No relevant context found."
370
+
371
+ # Create the prompt
372
+ prompt = f"""
373
+ {self.system_prompt}
374
+
375
+ Context:
376
+ {context_string}
377
+
378
+ Query: {query}
379
+
380
+ Answer:"""
381
+
382
+ return prompt
383
+
384
+ except Exception as e:
385
+ print(f"Error creating audit prompt: {e}")
386
+ return f"Error creating prompt: {e}"
387
+
388
+ def _generate_answer(self, prompt: str) -> str:
389
+ """Generate answer using the LLM."""
390
+ try:
391
+ if not prompt or not isinstance(prompt, str) or prompt.strip() == "":
392
+ return "Error: No prompt provided"
393
+
394
+ # Ensure LLM client is available
395
+ if not self.llm_client:
396
+ return "Error: LLM client not available"
397
+
398
+ # Generate response using the correct method
399
+ if hasattr(self.llm_client, 'generate'):
400
+ # Use the generate method (for adapters)
401
+ response = self.llm_client.generate([{"role": "user", "content": prompt}])
402
+
403
+ # Extract content from LLMResponse
404
+ if hasattr(response, 'content'):
405
+ answer = response.content
406
+ else:
407
+ answer = str(response)
408
+
409
+ elif hasattr(self.llm_client, 'invoke'):
410
+ # Use the invoke method (for direct LangChain models)
411
+ response = self.llm_client.invoke(prompt)
412
+
413
+ # Extract content safely
414
+ if hasattr(response, 'content') and response.content is not None:
415
+ answer = response.content
416
+ elif isinstance(response, str) and response.strip():
417
+ answer = response
418
+ else:
419
+ answer = str(response) if response is not None else "Error: LLM returned None response"
420
+ else:
421
+ return "Error: LLM client has no generate or invoke method"
422
+
423
+ # Ensure answer is not None and is a string
424
+ if answer is None or not isinstance(answer, str):
425
+ return "Error: LLM returned invalid response"
426
+
427
+ return answer.strip()
428
+
429
+ except Exception as e:
430
+ print(f"Error generating answer: {e}")
431
+ return f"Error generating answer: {e}"
432
+
433
+ def run(
434
+ self,
435
+ query: str,
436
+ reports: List[str] = None,
437
+ sources: List[str] = None,
438
+ subtype: List[str] = None,
439
+ llm_provider: str = None,
440
+ use_reranking: bool = True,
441
+ search_mode: str = None,
442
+ search_alpha: float = None,
443
+ auto_infer_filters: bool = True,
444
+ filters: Dict[str, Any] = None,
445
+ ) -> PipelineResult:
446
+ """
447
+ Run the complete RAG pipeline.
448
+
449
+ Args:
450
+ query: User query
451
+ reports: List of specific report filenames
452
+ sources: Source category filter
453
+ subtype: List of subtypes/filenames
454
+ llm_provider: LLM provider to use
455
+ use_reranking: Whether to use reranking
456
+ search_mode: Search mode (vector, sparse, hybrid)
457
+ search_alpha: Alpha value for hybrid search
458
+ auto_infer_filters: Whether to auto-infer filters from query
+ filters: Optional dict of explicit filters (reports, sources, subtype, year, district, filenames); values here take precedence over the individual arguments
459
+
460
+ Returns:
461
+ PipelineResult object
462
+ """
463
+ try:
464
+ # Validate input
465
+ if not query or not isinstance(query, str) or query.strip() == "":
466
+ return PipelineResult(
467
+ answer="Error: Invalid query provided",
468
+ sources=[],
469
+ execution_time=0.0,
470
+ metadata={'error': 'Invalid query'},
471
+ query=query
472
+ )
473
+
474
+ # Ensure lists are not None
475
+ if reports is None:
476
+ reports = []
477
+ if subtype is None:
478
+ subtype = []
479
+
480
+ start_time = time.time()
481
+
482
+ # Auto-infer filters if enabled and no explicit filters provided
483
+ inferred_filters = {}
484
+ filters_applied = False
485
+ qdrant_filter = None  # Qdrant filter built by auto-inference (if any)
+ filter_summary = None  # Human-readable summary of the inferred filter
486
+
487
+ if auto_infer_filters and not any([reports, sources, subtype]):
488
+ print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
489
+ try:
490
+ # Import get_available_metadata here to avoid circular imports
491
+ from auditqa.retrieval.filter import get_available_metadata, infer_filters_from_query
492
+
493
+ # Get available metadata
494
+ available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
495
+
496
+ # Infer filters from query - this returns a Qdrant filter
497
+ qdrant_filter, filter_summary = infer_filters_from_query(
498
+ query=query,
499
+ available_metadata=available_metadata,
500
+ llm_client=self.llm_client
501
+ )
502
+
503
+ if qdrant_filter:
504
+ print(f"✅ QDRANT FILTER APPLIED: Using inferred Qdrant filter")
505
+ filters_applied = True
506
+ # Don't set sources/reports/subtype - use the Qdrant filter directly
507
+ else:
508
+ print(f"⚠️ NO QDRANT FILTER: Could not build Qdrant filter from query")
509
+
510
+ except Exception as e:
511
+ print(f"❌ AUTO-INFERENCE FAILED: {e}")
512
+ qdrant_filter = None
513
+ else:
514
+ # Check if any explicit filters were provided
515
+ filters_applied = any([reports, sources, subtype])
516
+ if filters_applied:
517
+ print(f"✅ EXPLICIT FILTERS: Using provided filters")
518
+ else:
519
+ print(f"⚠️ NO FILTERS: No explicit filters and auto-inference disabled")
520
+
521
+ # Merge the optional filters dict with the explicit arguments (dict values take precedence)
522
+ filters = filters or {}
+ reports = filters.get('reports', reports) or []
523
+ sources = filters.get('sources', sources) or []
524
+ subtype = filters.get('subtype', subtype) or []
525
+ year = filters.get('year', [])
526
+ district = filters.get('district', [])
527
+ filenames = filters.get('filenames', [])  # Support mutually exclusive filename filtering
528
+
529
+ # Get vectorstore
530
+ vectorstore = self.vectorstore_manager.get_vectorstore()
531
+ if not vectorstore:
532
+ return PipelineResult(
533
+ answer="Error: Vector store not available",
534
+ sources=[],
535
+ execution_time=0.0,
536
+ metadata={'error': 'Vector store not available'},
537
+ query=query
538
+ )
539
+
540
+ # Initialize context retriever if not already done
541
+ if not hasattr(self, 'context_retriever') or self.context_retriever is None:
542
+ # Get the actual vectorstore object
543
+ vectorstore_obj = self.vectorstore_manager.get_vectorstore()
544
+ if vectorstore_obj is None:
545
+ print("❌ ERROR: Vectorstore is None, cannot initialize ContextRetriever")
546
+ return PipelineResult(
+ answer="Error: Vector store not available",
+ sources=[],
+ execution_time=0.0,
+ metadata={'error': 'Vector store not available'},
+ query=query
+ )
547
+ self.context_retriever = ContextRetriever(vectorstore_obj, self.config)
548
+ print("✅ ContextRetriever initialized successfully")
549
+
550
+ # Debug config access
551
+ print(f" CONFIG DEBUG: Full config keys: {list(self.config.keys()) if isinstance(self.config, dict) else 'Not a dict'}")
552
+ print(f"🔍 CONFIG DEBUG: Retriever config: {self.config.get('retriever', {})}")
553
+ print(f"🔍 CONFIG DEBUG: Retrieval config: {self.config.get('retrieval', {})}")
554
+ print(f"🔍 CONFIG DEBUG: use_reranking from config: {self.config.get('retrieval', {}).get('use_reranking', 'NOT_FOUND')}")
555
+
556
+ # Get the correct top_k value
557
+ # Priority: retrieval config > retriever config > default
558
+ top_k = (
559
+ self.config.get('retrieval', {}).get('top_k') or
560
+ self.config.get('retriever', {}).get('top_k') or
561
+ 5
562
+ )
563
+
564
+ # Get reranking setting (config value overrides the call argument when present)
565
+ use_reranking = self.config.get('retrieval', {}).get('use_reranking', use_reranking)
566
+
567
+ print(f"🔍 CONFIG DEBUG: Final top_k: {top_k}")
568
+ print(f"🔍 CONFIG DEBUG: Final use_reranking: {use_reranking}")
569
+
570
+ # Retrieve context using the context retriever
571
+ context_docs = self.context_retriever.retrieve_context(
572
+ query=query,
573
+ k=top_k,
574
+ reports=reports,
575
+ sources=sources,
576
+ subtype=subtype,
577
+ year=year,
578
+ district=district,
579
+ filenames=filenames,
580
+ use_reranking=use_reranking,
581
+ qdrant_filter=qdrant_filter
582
+ )
583
+
584
+ # Ensure context_docs is not None
585
+ if context_docs is None:
586
+ context_docs = []
587
+
588
+ # Generate answer
589
+ answer = self._generate_answer(self.create_audit_prompt(query, context_docs))
590
+
591
+ execution_time = time.time() - start_time
592
+
593
+ # Create result with comprehensive metadata
594
+ result = PipelineResult(
595
+ answer=answer,
596
+ sources=context_docs,
597
+ execution_time=execution_time,
598
+ metadata={
599
+ 'llm_provider': llm_provider,
600
+ 'use_reranking': use_reranking,
601
+ 'search_mode': search_mode,
602
+ 'search_alpha': search_alpha,
603
+ 'auto_infer_filters': auto_infer_filters,
604
+ 'filters_applied': filters_applied,
605
+ 'with_filtering': filters_applied,
606
+ 'filter_conditions': {
607
+ 'reports': reports,
608
+ 'sources': sources,
609
+ 'subtype': subtype
610
+ },
611
+ 'inferred_filters': inferred_filters,
612
+ 'applied_filters': {
613
+ 'reports': reports,
614
+ 'sources': sources,
615
+ 'subtype': subtype
616
+ },
617
+ # Store filter and reranking metadata
618
+ 'filter_details': {
619
+ 'explicit_filters': {
620
+ 'reports': reports,
621
+ 'sources': sources,
622
+ 'subtype': subtype,
623
+ 'year': year
624
+ },
625
+ 'inferred_filters': inferred_filters if auto_infer_filters else {},
626
+ 'auto_inference_enabled': auto_infer_filters,
627
+ 'qdrant_filter_applied': qdrant_filter is not None,
628
+ 'filter_summary': filter_summary
629
+ },
630
+ 'reranker_model': self._get_reranker_model_name() if use_reranking else None,
631
+ 'reranker_applied': use_reranking,
632
+ 'reranking_info': {
633
+ 'model': self._get_reranker_model_name(),
634
+ 'applied': use_reranking,
635
+ 'top_k': len(context_docs) if context_docs else 0,
636
+ # 'original_documents': [
637
+ # {
638
+ # 'content': doc.page_content[:200] + '...' if len(doc.page_content) > 200 else doc.page_content,
639
+ # 'metadata': doc.metadata,
640
+ # 'score': getattr(doc, 'score', getattr(doc, 'original_score', 0.0))
641
+ # } for doc in context_docs
642
+ # ] if use_reranking else None,
643
+ 'reranked_documents': [
644
+ {
645
+ 'content': doc.page_content[:200] + '...' if len(doc.page_content) > 200 else doc.page_content,
646
+ 'metadata': doc.metadata,
647
+ 'score': doc.metadata.get('original_score', getattr(doc, 'score', 0.0)),
648
+ 'original_rank': doc.metadata.get('original_rank', None),
649
+ 'final_rank': doc.metadata.get('final_rank', None),
650
+ 'reranked_score': doc.metadata.get('reranked_score', None)
651
+ } for doc in context_docs
652
+ ] if use_reranking else None
653
+ }
654
+ },
655
+ query=query
656
+ )
657
+
658
+ return result
659
+
660
+ except Exception as e:
661
+ print(f"Error in pipeline run: {e}")
662
+ return PipelineResult(
663
+ answer=f"Error processing query: {e}",
664
+ sources=[],
665
+ execution_time=0.0,
666
+ metadata={'error': str(e)},
667
+ query=query
668
+ )
669
+
670
+
671
+
672
+ def get_system_status(self) -> Dict[str, Any]:
673
+ """
674
+ Get system status information.
675
+
676
+ Returns:
677
+ Dictionary with system status
678
+ """
679
+ status = {
680
+ "config_loaded": bool(self.config),
681
+ "chunks_loaded": bool(self.chunks),
682
+ "vectorstore_connected": bool(
683
+ self.vectorstore_manager and self.vectorstore_manager.get_vectorstore()
684
+ ),
685
+ "components_initialized": bool(
686
+ getattr(self, "context_retriever", None) and getattr(self, "report_service", None)
687
+ ),
688
+ }
689
+
690
+ if self.chunks:
691
+ status["num_chunks"] = len(self.chunks)
692
+
693
+ if self.report_service:
694
+ status["available_sources"] = self.report_service.get_available_sources()
695
+ status["available_reports"] = len(
696
+ self.report_service.get_available_reports()
697
+ )
698
+
699
+ status["overall_status"] = (
700
+ "ready"
701
+ if all(
702
+ [
703
+ status["config_loaded"],
704
+ status["chunks_loaded"],
705
+ status["vectorstore_connected"],
706
+ status["components_initialized"],
707
+ ]
708
+ )
709
+ else "not_ready"
710
+ )
711
+
712
+ return status
713
+
714
+ def get_available_llm_providers(self) -> List[str]:
715
+ """Get list of available LLM providers."""
716
+ providers = []
717
+ reader_config = self.config.get("reader", {})
718
+
719
+ for provider in [
720
+ "MISTRAL",
721
+ "OPENAI",
722
+ "OLLAMA",
723
+ "INF_PROVIDERS",
724
+ "NVIDIA",
725
+ "DEDICATED",
726
+ "OPENROUTER",
727
+ ]:
728
+ if provider in reader_config:
729
+ providers.append(provider.lower())
730
+
731
+ return providers
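For orientation, a minimal usage sketch of the pipeline methods added above. The PipelineManager constructor signature is not shown in this commit, so passing the loaded config is an assumption; the query string is illustrative.

from src import PipelineManager, load_config

config = load_config()              # defaults to src/config/settings.yaml
pipeline = PipelineManager(config)  # assumed constructor: takes the loaded config dict

if pipeline.connect_vectorstore():  # connect to the existing Qdrant collection
    result = pipeline.run(
        query="What were the main audit findings for local governments?",
        use_reranking=True,
        auto_infer_filters=True,
    )
    print(result.answer)
    print(f"{len(result.sources)} documents in {result.execution_time:.2f}s")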
src/reporting/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Report metadata and utilities."""
2
+
3
+ from .metadata import get_report_metadata, get_available_sources
4
+ from .service import ReportService
5
+
6
+ __all__ = ["get_report_metadata", "get_available_sources", "ReportService"]
src/reporting/feedback_schema.py ADDED
@@ -0,0 +1,196 @@
1
+ """
2
+ Feedback Schema for RAG Chatbot
3
+
4
+ This module defines dataclasses for feedback data structures
5
+ and provides Snowflake schema generation.
6
+ """
7
+
8
+ from dataclasses import dataclass, asdict, field
9
+ from typing import List, Optional, Dict, Any, Union
10
+ from datetime import datetime
11
+
12
+
13
+ @dataclass
14
+ class RetrievedDocument:
15
+ """Single retrieved document metadata"""
16
+ doc_id: str
17
+ filename: str
18
+ page: int
19
+ score: float
20
+ content: str
21
+ metadata: Dict[str, Any]
22
+
23
+
24
+ @dataclass
25
+ class RetrievalEntry:
26
+ """Single retrieval operation metadata"""
27
+ rag_query: str
28
+ documents_retrieved: List[RetrievedDocument]
29
+ conversation_length: int
30
+ filters_applied: Optional[Dict[str, Any]] = None
31
+ timestamp: Optional[float] = None
32
+ _raw_data: Optional[Dict[str, Any]] = None
33
+
34
+
35
+ @dataclass
36
+ class UserFeedback:
37
+ """User feedback submission data"""
38
+ feedback_id: str
39
+ open_ended_feedback: Optional[str]
40
+ score: int
41
+ is_feedback_about_last_retrieval: bool
42
+ retrieved_data: List[RetrievalEntry]
43
+ conversation_id: str
44
+ timestamp: float
45
+ message_count: int
46
+ has_retrievals: bool
47
+ retrieval_count: int
48
+ user_query: Optional[str] = None
49
+ bot_response: Optional[str] = None
50
+ created_at: str = field(default_factory=lambda: datetime.now().isoformat())
51
+
52
+ def to_dict(self) -> Dict[str, Any]:
53
+ """Convert to dictionary with nested data structures"""
54
+ result = asdict(self)
55
+ # Handle nested objects
56
+ if self.retrieved_data:
57
+ result['retrieved_data'] = [self._serialize_retrieval_entry(entry) for entry in self.retrieved_data]
58
+ return result
59
+
60
+ def _serialize_retrieval_entry(self, entry: RetrievalEntry) -> Dict[str, Any]:
61
+ """Serialize retrieval entry to dict"""
62
+ # If raw data exists, use it (it's already properly formatted)
63
+ if hasattr(entry, '_raw_data') and entry._raw_data:
64
+ return entry._raw_data
65
+
66
+ # Otherwise, serialize the dataclass
67
+ result = asdict(entry)
68
+ if entry.documents_retrieved:
69
+ result['documents_retrieved'] = [asdict(doc) for doc in entry.documents_retrieved]
70
+ return result
71
+
72
+ @staticmethod
+ def to_snowflake_schema() -> Dict[str, Any]:
73
+ """Generate Snowflake schema for this dataclass"""
74
+ schema = {
75
+ "feedback_id": "VARCHAR(255)",
76
+ "open_ended_feedback": "VARCHAR(16777216)", # Large text
77
+ "score": "INTEGER",
78
+ "is_feedback_about_last_retrieval": "BOOLEAN",
79
+ "conversation_id": "VARCHAR(255)",
80
+ "timestamp": "NUMBER(20, 0)",
81
+ "message_count": "INTEGER",
82
+ "has_retrievals": "BOOLEAN",
83
+ "retrieval_count": "INTEGER",
84
+ "user_query": "VARCHAR(16777216)",
85
+ "bot_response": "VARCHAR(16777216)",
86
+ "created_at": "TIMESTAMP_NTZ",
87
+ "retrieved_data": "VARIANT", # Array of retrieval entries
88
+ # retrieved_data structure:
89
+ # [
90
+ # {
91
+ # "rag_query": "...",
92
+ # "conversation_length": 5,
93
+ # "timestamp": 1234567890,
94
+ # "docs_retrieved": [
95
+ # {"filename": "...", "page": 14, "score": 0.95, ...},
96
+ # ...
97
+ # ]
98
+ # },
99
+ # ...
100
+ # ]
101
+ }
102
+ return schema
103
+
104
+ @classmethod
105
+ def get_snowflake_create_table_sql(cls, table_name: str = "user_feedback") -> str:
106
+ """Generate CREATE TABLE SQL for Snowflake"""
107
+ schema = cls.to_snowflake_schema()
108
+
109
+ columns = []
110
+ for col_name, col_type in schema.items():
111
+ nullable = "NULL" if col_name not in ["feedback_id", "score", "timestamp"] else "NOT NULL"
112
+ columns.append(f" {col_name} {col_type} {nullable}")
113
+
114
+ # Build SQL string properly
115
+ columns_str = ",\n".join(columns)
116
+
117
+ sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
118
+ {columns_str},
119
+ PRIMARY KEY (feedback_id)
120
+ );
121
+
122
+ -- Note: Snowflake standard tables do not support CREATE INDEX;
123
+ -- query pruning is handled automatically via micro-partitions.
124
+ -- For large tables, consider: ALTER TABLE {table_name} CLUSTER BY (timestamp);
125
+ """
131
+ return sql
132
+
133
+
134
+ # Snowflake variant schema for retrieved_data array
135
+ RETRIEVAL_ENTRY_SCHEMA = {
136
+ "rag_query": "VARCHAR",
137
+ "documents_retrieved": "ARRAY", # Array of document objects
138
+ "conversation_length": "INTEGER",
139
+ "filters_applied": "OBJECT",
140
+ "timestamp": "NUMBER"
141
+ }
142
+
143
+ DOCUMENT_SCHEMA = {
144
+ "doc_id": "VARCHAR",
145
+ "filename": "VARCHAR",
146
+ "page": "INTEGER",
147
+ "score": "DOUBLE",
148
+ "content": "VARCHAR(16777216)",
149
+ "metadata": "OBJECT"
150
+ }
151
+
152
+
153
+ def generate_snowflake_schema_sql() -> str:
154
+ """Generate complete Snowflake schema SQL for feedback system"""
155
+ return UserFeedback.get_snowflake_create_table_sql("user_feedback")
156
+
157
+
158
+ def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
159
+ """Create UserFeedback instance from dictionary"""
160
+ # Parse retrieved_data if present
161
+ retrieved_data = []
162
+ if "retrieved_data" in data and data["retrieved_data"]:
163
+ for entry_dict in data.get("retrieved_data", []):
164
+ # Map the actual structure from rag_retrieval_history
165
+ # Entry has: conversation_up_to, rag_query_expansion, docs_retrieved
166
+ try:
167
+ # Try to map to expected structure
168
+ entry = RetrievalEntry(
169
+ rag_query=entry_dict.get("rag_query_expansion", ""),
170
+ documents_retrieved=[], # Empty for now, will store as raw data
171
+ conversation_length=len(entry_dict.get("conversation_up_to", [])),
172
+ filters_applied=None,
173
+ timestamp=entry_dict.get("timestamp", None)
174
+ )
175
+ # Store raw data in the entry
176
+ entry._raw_data = entry_dict # Store original for preservation
177
+ retrieved_data.append(entry)
178
+ except Exception:
179
+ # If mapping fails, skip this entry rather than failing the whole feedback
180
+ continue
181
+
182
+ return UserFeedback(
183
+ feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
184
+ open_ended_feedback=data.get("open_ended_feedback"),
185
+ score=data["score"],
186
+ is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
187
+ retrieved_data=retrieved_data,
188
+ conversation_id=data["conversation_id"],
189
+ timestamp=data["timestamp"],
190
+ message_count=data["message_count"],
191
+ has_retrievals=data["has_retrievals"],
192
+ retrieval_count=data["retrieval_count"],
193
+ user_query=data.get("user_query"),
194
+ bot_response=data.get("bot_response")
195
+ )
196
+
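To make the dataclass flow above concrete, a short sketch of building a UserFeedback from a raw dict and emitting the table DDL (the field values are illustrative):

import time
from src.reporting.feedback_schema import (
    create_feedback_from_dict,
    generate_snowflake_schema_sql,
)

feedback = create_feedback_from_dict({
    "score": 4,
    "is_feedback_about_last_retrieval": True,
    "conversation_id": "conv-123",  # illustrative ID
    "timestamp": time.time(),
    "message_count": 6,
    "has_retrievals": True,
    "retrieval_count": 2,
    "open_ended_feedback": "Answer cited the right report.",
    "retrieved_data": [],
})
print(feedback.feedback_id)             # auto-derived from the timestamp when absent
print(generate_snowflake_schema_sql())  # CREATE TABLE ... for the user_feedback table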
src/reporting/metadata.py ADDED
@@ -0,0 +1,216 @@
1
+ """Report metadata management."""
2
+
3
+ from typing import Dict, List, Any, Set
4
+ from pathlib import Path
5
+
6
+
7
+ def get_report_metadata(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
8
+ """
9
+ Extract metadata from chunks.
10
+
11
+ Args:
12
+ chunks: List of chunk dictionaries
13
+
14
+ Returns:
15
+ Dictionary with report metadata
16
+ """
17
+ if not chunks:
18
+ return {}
19
+
20
+ sources = set()
21
+ filenames = set()
22
+ years = set()
23
+
24
+ for chunk in chunks:
25
+ metadata = chunk.get("metadata", {})
26
+
27
+ if "source" in metadata:
28
+ sources.add(metadata["source"])
29
+
30
+ if "filename" in metadata:
31
+ filenames.add(metadata["filename"])
32
+
33
+ if "year" in metadata:
34
+ years.add(metadata["year"])
35
+
36
+ return {
37
+ "sources": sorted(list(sources)),
38
+ "filenames": sorted(list(filenames)),
39
+ "years": sorted(list(years)),
40
+ "total_chunks": len(chunks)
41
+ }
42
+
43
+
44
+ def get_available_sources() -> List[str]:
45
+ """
46
+ Get list of available report sources (legacy compatibility).
47
+
48
+ Returns:
49
+ List of source categories
50
+ """
51
+ # This would typically come from the original auditqa_old.reports module
52
+ # For now, return common categories
53
+ return [
54
+ "Consolidated",
55
+ "Ministry, Department, Agency and Projects",
56
+ "Local Government",
57
+ "Value for Money",
58
+ "Thematic",
59
+ "Hospital",
60
+ "Project"
61
+ ]
62
+
63
+
64
+ def get_source_subtypes() -> Dict[str, List[str]]:
65
+ """
66
+ Get mapping of sources to their subtypes (placeholder).
67
+
68
+ Returns:
69
+ Dictionary mapping sources to subtypes
70
+ """
71
+ # This was originally imported from auditqa_old.reports.new_files
72
+ # For now, return a placeholder structure
73
+ return {
74
+ "Consolidated": ["Annual Consolidated OAG 2024", "Annual Consolidated OAG 2023"],
75
+ "Local Government": ["District Reports", "Municipal Reports"],
76
+ "Ministry, Department, Agency and Projects": ["Ministry Reports", "Agency Reports"],
77
+ "Value for Money": ["VFM Reports 2024", "VFM Reports 2023"],
78
+ "Thematic": ["Thematic Reports 2024", "Thematic Reports 2023"],
79
+ "Hospital": ["Hospital Reports 2024", "Hospital Reports 2023"],
80
+ "Project": ["Project Reports 2024", "Project Reports 2023"]
81
+ }
82
+
83
+
84
+ def validate_report_filters(
85
+ reports: List[str] = None,
86
+ sources: str = None,
87
+ subtype: List[str] = None,
88
+ available_metadata: Dict[str, Any] = None
89
+ ) -> Dict[str, Any]:
90
+ """
91
+ Validate report filter parameters.
92
+
93
+ Args:
94
+ reports: List of specific report filenames
95
+ sources: Source category
96
+ subtype: List of subtypes
97
+ available_metadata: Available metadata for validation
98
+
99
+ Returns:
100
+ Dictionary with validation results
101
+ """
102
+ validation_result = {
103
+ "valid": True,
104
+ "warnings": [],
105
+ "errors": []
106
+ }
107
+
108
+ if not available_metadata:
109
+ validation_result["warnings"].append("No metadata available for validation")
110
+ return validation_result
111
+
112
+ available_sources = available_metadata.get("sources", [])
113
+ available_filenames = available_metadata.get("filenames", [])
114
+
115
+ # Validate sources
116
+ if sources and sources not in available_sources:
117
+ validation_result["errors"].append(f"Source '{sources}' not found in available sources")
118
+ validation_result["valid"] = False
119
+
120
+ # Validate reports
121
+ if reports:
122
+ for report in reports:
123
+ if report not in available_filenames:
124
+ validation_result["warnings"].append(f"Report '{report}' not found in available reports")
125
+
126
+ # Validate subtypes
127
+ if subtype:
128
+ for sub in subtype:
129
+ if sub not in available_filenames:
130
+ validation_result["warnings"].append(f"Subtype '{sub}' not found in available reports")
131
+
132
+ return validation_result
133
+
134
+
135
+ def get_report_statistics(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
136
+ """
137
+ Get statistics about reports in chunks.
138
+
139
+ Args:
140
+ chunks: List of chunk dictionaries
141
+
142
+ Returns:
143
+ Dictionary with report statistics
144
+ """
145
+ if not chunks:
146
+ return {}
147
+
148
+ stats = {
149
+ "total_chunks": len(chunks),
150
+ "sources": {},
151
+ "years": {},
152
+ "avg_chunk_length": 0,
153
+ "total_content_length": 0
154
+ }
155
+
156
+ total_length = 0
157
+
158
+ for chunk in chunks:
159
+ content = chunk.get("content", "")
160
+ total_length += len(content)
161
+
162
+ metadata = chunk.get("metadata", {})
163
+
164
+ # Count by source
165
+ source = metadata.get("source", "Unknown")
166
+ stats["sources"][source] = stats["sources"].get(source, 0) + 1
167
+
168
+ # Count by year
169
+ year = metadata.get("year", "Unknown")
170
+ stats["years"][year] = stats["years"].get(year, 0) + 1
171
+
172
+ stats["total_content_length"] = total_length
173
+ stats["avg_chunk_length"] = total_length / len(chunks) if chunks else 0
174
+
175
+ return stats
176
+
177
+
178
+ def filter_chunks_by_metadata(
179
+ chunks: List[Dict[str, Any]],
180
+ source_filter: str = None,
181
+ filename_filter: List[str] = None,
182
+ year_filter: List[str] = None
183
+ ) -> List[Dict[str, Any]]:
184
+ """
185
+ Filter chunks by metadata criteria.
186
+
187
+ Args:
188
+ chunks: List of chunk dictionaries
189
+ source_filter: Source to filter by
190
+ filename_filter: List of filenames to filter by
191
+ year_filter: List of years to filter by
192
+
193
+ Returns:
194
+ Filtered list of chunks
195
+ """
196
+ filtered_chunks = chunks
197
+
198
+ if source_filter:
199
+ filtered_chunks = [
200
+ chunk for chunk in filtered_chunks
201
+ if chunk.get("metadata", {}).get("source") == source_filter
202
+ ]
203
+
204
+ if filename_filter:
205
+ filtered_chunks = [
206
+ chunk for chunk in filtered_chunks
207
+ if chunk.get("metadata", {}).get("filename") in filename_filter
208
+ ]
209
+
210
+ if year_filter:
211
+ filtered_chunks = [
212
+ chunk for chunk in filtered_chunks
213
+ if chunk.get("metadata", {}).get("year") in year_filter
214
+ ]
215
+
216
+ return filtered_chunks
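The metadata helpers above compose naturally: filter first, then summarize. A minimal sketch with an illustrative chunk structure (a content string plus a metadata dict, as the functions expect):

from src.reporting.metadata import filter_chunks_by_metadata, get_report_statistics

chunks = [  # illustrative chunks
    {"content": "Finding 1 ...",
     "metadata": {"source": "Local Government", "filename": "gulu_2023.pdf", "year": "2023"}},
    {"content": "Finding 2 ...",
     "metadata": {"source": "Hospital", "filename": "mulago_2022.pdf", "year": "2022"}},
]

lg_chunks = filter_chunks_by_metadata(chunks, source_filter="Local Government")
stats = get_report_statistics(lg_chunks)
print(stats["total_chunks"], stats["sources"], stats["avg_chunk_length"])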
src/reporting/service.py ADDED
@@ -0,0 +1,144 @@
1
+ """Report service for managing report operations."""
2
+
3
+ from typing import Dict, List, Any, Optional
4
+ from .metadata import get_report_metadata, get_available_sources, get_source_subtypes
5
+
6
+
7
+ class ReportService:
8
+ """Service class for report operations."""
9
+
10
+ def __init__(self, chunks: List[Dict[str, Any]] = None):
11
+ """
12
+ Initialize report service.
13
+
14
+ Args:
15
+ chunks: List of chunk dictionaries
16
+ """
17
+ self.chunks = chunks or []
18
+ self.metadata = get_report_metadata(self.chunks) if self.chunks else {}
19
+
20
+ def get_available_sources(self) -> List[str]:
21
+ """Get available report sources."""
22
+ if self.metadata:
23
+ return self.metadata.get("sources", [])
24
+ return get_available_sources()
25
+
26
+ def get_available_reports(self) -> List[str]:
27
+ """Get available report filenames."""
28
+ return self.metadata.get("filenames", [])
29
+
30
+ def get_source_subtypes(self) -> Dict[str, List[str]]:
31
+ """Get source to subtype mapping."""
32
+ # For now, use the placeholder function
33
+ # In a full implementation, this would be derived from actual data
34
+ return get_source_subtypes()
35
+
36
+ def get_reports_by_source(self, source: str) -> List[str]:
37
+ """
38
+ Get reports filtered by source.
39
+
40
+ Args:
41
+ source: Source category
42
+
43
+ Returns:
44
+ List of report filenames
45
+ """
46
+ if not self.chunks:
47
+ return []
48
+
49
+ reports = set()
50
+ for chunk in self.chunks:
51
+ metadata = chunk.get("metadata", {})
52
+ if metadata.get("source") == source:
53
+ filename = metadata.get("filename")
54
+ if filename:
55
+ reports.add(filename)
56
+
57
+ return sorted(list(reports))
58
+
59
+ def get_years_by_source(self, source: str) -> List[str]:
60
+ """
61
+ Get years available for a specific source.
62
+
63
+ Args:
64
+ source: Source category
65
+
66
+ Returns:
67
+ List of years
68
+ """
69
+ if not self.chunks:
70
+ return []
71
+
72
+ years = set()
73
+ for chunk in self.chunks:
74
+ metadata = chunk.get("metadata", {})
75
+ if metadata.get("source") == source:
76
+ year = metadata.get("year")
77
+ if year:
78
+ years.add(year)
79
+
80
+ return sorted(list(years))
81
+
82
+ def search_reports(self, query: str) -> List[str]:
83
+ """
84
+ Search for reports by name.
85
+
86
+ Args:
87
+ query: Search query
88
+
89
+ Returns:
90
+ List of matching report filenames
91
+ """
92
+ if not self.chunks:
93
+ return []
94
+
95
+ query_lower = query.lower()
96
+ matching_reports = set()
97
+
98
+ for chunk in self.chunks:
99
+ metadata = chunk.get("metadata", {})
100
+ filename = metadata.get("filename", "")
101
+
102
+ if query_lower in filename.lower():
103
+ matching_reports.add(filename)
104
+
105
+ return sorted(list(matching_reports))
106
+
107
+ def get_report_info(self, filename: str) -> Dict[str, Any]:
108
+ """
109
+ Get information about a specific report.
110
+
111
+ Args:
112
+ filename: Report filename
113
+
114
+ Returns:
115
+ Dictionary with report information
116
+ """
117
+ if not self.chunks:
118
+ return {}
119
+
120
+ report_info = {
121
+ "filename": filename,
122
+ "chunk_count": 0,
123
+ "sources": set(),
124
+ "years": set(),
125
+ "total_content_length": 0
126
+ }
127
+
128
+ for chunk in self.chunks:
129
+ metadata = chunk.get("metadata", {})
130
+ if metadata.get("filename") == filename:
131
+ report_info["chunk_count"] += 1
132
+ report_info["total_content_length"] += len(chunk.get("content", ""))
133
+
134
+ if "source" in metadata:
135
+ report_info["sources"].add(metadata["source"])
136
+
137
+ if "year" in metadata:
138
+ report_info["years"].add(metadata["year"])
139
+
140
+ # Convert sets to lists
141
+ report_info["sources"] = list(report_info["sources"])
142
+ report_info["years"] = list(report_info["years"])
143
+
144
+ return report_info
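ReportService wraps the same chunk structure behind query methods; a minimal sketch (the chunk list and filenames are illustrative):

from src.reporting.service import ReportService

chunks = [{"content": "Finding 1 ...",
           "metadata": {"source": "Local Government", "filename": "gulu_2023.pdf", "year": "2023"}}]
service = ReportService(chunks)
print(service.get_available_sources())            # ['Local Government']
print(service.get_reports_by_source("Local Government"))
print(service.search_reports("gulu"))             # substring match on filenames
info = service.get_report_info("gulu_2023.pdf")
print(info["chunk_count"], info["years"])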
src/reporting/snowflake_connector.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ Snowflake Connector for Feedback System
3
+
4
+ This module handles inserting user feedback into Snowflake.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import logging
10
+ from typing import Dict, Any, Optional
11
+ from src.reporting.feedback_schema import UserFeedback
12
+
13
+ # Try to import snowflake connector
14
+ try:
15
+ import snowflake.connector
16
+ SNOWFLAKE_AVAILABLE = True
17
+ except ImportError:
18
+ SNOWFLAKE_AVAILABLE = False
19
+ logging.warning("⚠️ snowflake-connector-python not installed. Install with: pip install snowflake-connector-python")
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class SnowflakeFeedbackConnector:
27
+ """Connector for inserting feedback into Snowflake"""
28
+
29
+ def __init__(
30
+ self,
31
+ user: str,
32
+ password: str,
33
+ account: str,
34
+ warehouse: str,
35
+ database: str = "SNOWFLAKE_LEARNING",
36
+ schema: str = "PUBLIC"
37
+ ):
38
+ self.user = user
39
+ self.password = password
40
+ self.account = account
41
+ self.warehouse = warehouse
42
+ self.database = database
43
+ self.schema = schema
44
+ self._connection = None
45
+
46
+ def connect(self):
47
+ """Establish Snowflake connection"""
48
+ if not SNOWFLAKE_AVAILABLE:
49
+ raise ImportError("snowflake-connector-python is not installed. Install with: pip install snowflake-connector-python")
50
+
51
+ logger.info("=" * 80)
52
+ logger.info("🔌 SNOWFLAKE CONNECTION: Attempting to connect...")
53
+ logger.info(f" - Account: {self.account}")
54
+ logger.info(f" - Warehouse: {self.warehouse}")
55
+ logger.info(f" - Database: {self.database}")
56
+ logger.info(f" - Schema: {self.schema}")
57
+ logger.info(f" - User: {self.user}")
58
+
59
+ try:
60
+ self._connection = snowflake.connector.connect(
61
+ user=self.user,
62
+ password=self.password,
63
+ account=self.account,
64
+ warehouse=self.warehouse
65
+ # Don't set database/schema in connection - we'll do it per query
66
+ )
67
+ logger.info("✅ SNOWFLAKE CONNECTION: Successfully connected")
68
+ logger.info("=" * 80)
69
+ print(f"✅ Connected to Snowflake: {self.database}.{self.schema}")
70
+ except Exception as e:
71
+ logger.error(f"❌ SNOWFLAKE CONNECTION FAILED: {e}")
72
+ logger.error("=" * 80)
73
+ print(f"❌ Failed to connect to Snowflake: {e}")
74
+ raise
75
+
76
+ def disconnect(self):
77
+ """Close Snowflake connection"""
78
+ if self._connection:
79
+ self._connection.close()
80
+ print("✅ Disconnected from Snowflake")
81
+
82
+ def insert_feedback(self, feedback: UserFeedback) -> bool:
83
+ """Insert a single feedback record into Snowflake"""
84
+ logger.info("=" * 80)
85
+ logger.info("🔄 SNOWFLAKE INSERT: Starting feedback insertion process")
86
+ logger.info(f"📝 Feedback ID: {feedback.feedback_id}")
87
+
88
+ if not self._connection:
89
+ logger.error("❌ Not connected to Snowflake. Call connect() first.")
90
+ raise RuntimeError("Not connected to Snowflake. Call connect() first.")
91
+
92
+ try:
93
+ logger.info("📊 VALIDATION: Validating feedback data structure...")
94
+
95
+ # Validate feedback object
96
+ validation_errors = []
97
+ if not feedback.feedback_id:
98
+ validation_errors.append("Missing feedback_id")
99
+ if feedback.score is None:
100
+ validation_errors.append("Missing score")
101
+ if feedback.timestamp is None:
102
+ validation_errors.append("Missing timestamp")
103
+
104
+ if validation_errors:
105
+ logger.error(f"❌ VALIDATION FAILED: {validation_errors}")
106
+ return False
107
+ else:
108
+ logger.info("✅ VALIDATION PASSED: All required fields present")
109
+
110
+ logger.info("📋 Data Summary:")
111
+ logger.info(f" - Feedback ID: {feedback.feedback_id}")
112
+ logger.info(f" - Score: {feedback.score}")
113
+ logger.info(f" - Conversation ID: {feedback.conversation_id}")
114
+ logger.info(f" - Has Retrievals: {feedback.has_retrievals}")
115
+ logger.info(f" - Retrieval Count: {feedback.retrieval_count}")
116
+ logger.info(f" - Message Count: {feedback.message_count}")
117
+ logger.info(f" - Timestamp: {feedback.timestamp}")
118
+
119
+ cursor = self._connection.cursor()
120
+ logger.info("✅ SNOWFLAKE CONNECTION: Cursor created")
121
+
122
+ # Set database and schema context
123
+ logger.info(f"🔧 SETTING CONTEXT: Database={self.database}, Schema={self.schema}")
124
+ try:
125
+ cursor.execute(f'USE DATABASE "{self.database}"')
126
+ cursor.execute(f'USE SCHEMA "{self.schema}"')
127
+ cursor.execute("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA()")
128
+ current_db, current_schema = cursor.fetchone()
129
+ logger.info(f"✅ Current context verified: Database={current_db}, Schema={current_schema}")
130
+ except Exception as e:
131
+ logger.error(f"❌ Could not set context: {e}")
132
+ raise
133
+
134
+ # Prepare data
135
+ logger.info("🔧 DATA PREPARATION: Preparing retrieved_data...")
136
+ retrieved_data_raw = feedback.to_dict()['retrieved_data']
137
+
138
+ logger.info(f" - Retrieved data type (raw): {type(retrieved_data_raw).__name__}")
139
+ logger.info(f" - Retrieved data: {repr(retrieved_data_raw)[:200]}")
140
+
141
+ # If retrieved_data is already a string (from UI), parse it
142
+ if isinstance(retrieved_data_raw, str):
143
+ logger.info(" - Parsing string to Python object")
144
+ retrieved_data = json.loads(retrieved_data_raw)
145
+ elif retrieved_data_raw is None:
146
+ retrieved_data = None
147
+ else:
148
+ # It's already a Python object (list/dict)
149
+ logger.info(" - Data is already a Python object")
150
+ retrieved_data = retrieved_data_raw
151
+
152
+ logger.info(f" - Retrieved data size: {len(str(retrieved_data)) if retrieved_data else 0} characters")
153
+ logger.info(f" - Retrieved data type: {type(retrieved_data).__name__}")
154
+
155
+ # Convert to JSON string for TEXT column
156
+ if retrieved_data:
157
+ retrieved_data_for_db = json.dumps(retrieved_data)
158
+ logger.info(f" - Converting to JSON string for TEXT column")
159
+ logger.info(f" - JSON string length: {len(retrieved_data_for_db)}")
160
+ else:
161
+ logger.info(f" - Retrieved data is None, using NULL")
162
+ retrieved_data_for_db = None
163
+
164
+ # Build SQL with retrieved_data as a TEXT column parameter
165
+ sql = f"""INSERT INTO user_feedback (
166
+ feedback_id,
167
+ open_ended_feedback,
168
+ score,
169
+ is_feedback_about_last_retrieval,
170
+ conversation_id,
171
+ timestamp,
172
+ message_count,
173
+ has_retrievals,
174
+ retrieval_count,
175
+ user_query,
176
+ bot_response,
177
+ created_at,
178
+ retrieved_data
179
+ ) VALUES (
180
+ %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
181
+ %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
182
+ %(retrieval_count)s, %(user_query)s, %(bot_response)s, %(created_at)s,
183
+ %(retrieved_data)s
184
+ )"""
185
+
186
+ logger.info("📝 SQL PREPARATION: Building INSERT statement...")
187
+ logger.info(f" - Target table: user_feedback")
188
+ logger.info(f" - Database: {self.database}")
189
+ logger.info(f" - Schema: {self.schema}")
190
+
191
+ # Prepare parameters
192
+ params = {
193
+ 'feedback_id': feedback.feedback_id,
194
+ 'open_ended_feedback': feedback.open_ended_feedback,
195
+ 'score': feedback.score,
196
+ 'is_feedback_about_last_retrieval': feedback.is_feedback_about_last_retrieval,
197
+ 'conversation_id': feedback.conversation_id,
198
+ 'timestamp': int(feedback.timestamp),
199
+ 'message_count': feedback.message_count,
200
+ 'has_retrievals': feedback.has_retrievals,
201
+ 'retrieval_count': feedback.retrieval_count,
202
+ 'user_query': feedback.user_query,
203
+ 'bot_response': feedback.bot_response,
204
+ 'created_at': feedback.created_at,
205
+ 'retrieved_data': retrieved_data_for_db
206
+ }
207
+
208
+ # Execute insert
209
+ logger.info("🚀 SQL EXECUTION: Executing INSERT query...")
210
+ cursor.execute(sql, params)
211
+
212
+ logger.info("✅ SQL EXECUTION: Query executed successfully")
213
+ logger.info(f" - Rows affected: 1")
214
+ logger.info(f" - Status: SUCCESS")
215
+
216
+ cursor.close()
217
+ logger.info("✅ SNOWFLAKE INSERT: Feedback inserted successfully")
218
+ logger.info(f"📝 Inserted feedback: {feedback.feedback_id}")
219
+ logger.info("=" * 80)
220
+ return True
221
+
222
+ except Exception as e:
223
+ # Check if it's a Snowflake error
224
+ if SNOWFLAKE_AVAILABLE and "ProgrammingError" in str(type(e)):
225
+ logger.error(f"❌ SQL EXECUTION ERROR: {e}")
226
+ logger.error(f" - Error code: {getattr(e, 'errno', 'Unknown')}")
227
+ logger.error(f" - SQL state: {getattr(e, 'sqlstate', 'Unknown')}")
228
+ else:
229
+ logger.error(f"❌ SNOWFLAKE INSERT FAILED: {type(e).__name__}")
230
+ logger.error(f" - Error: {e}")
231
+ logger.error("=" * 80)
232
+ return False
233
+
234
+ def __enter__(self):
235
+ """Context manager entry"""
236
+ self.connect()
237
+ return self
238
+
239
+ def __exit__(self, exc_type, exc_val, exc_tb):
240
+ """Context manager exit"""
241
+ self.disconnect()
242
+
243
+
244
+ def get_snowflake_connector_from_env() -> Optional[SnowflakeFeedbackConnector]:
245
+ """Create Snowflake connector from environment variables"""
246
+ user = os.getenv("SNOWFLAKE_USER")
247
+ password = os.getenv("SNOWFLAKE_PASSWORD")
248
+ account = os.getenv("SNOWFLAKE_ACCOUNT")
249
+ warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
250
+ database = os.getenv("SNOWFLAKE_DATABASE", "SNOWFLAKE_LEARN")
251
+ schema = os.getenv("SNOWFLAKE_SCHEMA", "PUBLIC")
252
+
253
+ if not all([user, password, account, warehouse]):
254
+ print("⚠️ Snowflake credentials not found in environment variables")
255
+ print("Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
256
+ return None
257
+
258
+ return SnowflakeFeedbackConnector(
259
+ user=user,
260
+ password=password,
261
+ account=account,
262
+ warehouse=warehouse,
263
+ database=database,
264
+ schema=schema
265
+ )
266
+
267
+
268
+ def save_to_snowflake(feedback: UserFeedback) -> bool:
269
+ """Helper function to save feedback to Snowflake"""
270
+ logger.info("=" * 80)
271
+ logger.info("🔵 SNOWFLAKE SAVE: Starting save process")
272
+ logger.info(f"📝 Feedback ID: {feedback.feedback_id}")
273
+
274
+ connector = get_snowflake_connector_from_env()
275
+
276
+ if not connector:
277
+ logger.warning("⚠️ SNOWFLAKE SAVE: Skipping insertion (credentials not configured)")
278
+ logger.warning(" Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
279
+ logger.info("=" * 80)
280
+ return False
281
+
282
+ try:
283
+ logger.info("📡 SNOWFLAKE SAVE: Establishing connection...")
284
+ connector.connect()
285
+ logger.info("✅ SNOWFLAKE SAVE: Connection established")
286
+
287
+ logger.info("📥 SNOWFLAKE SAVE: Attempting to insert feedback...")
288
+ success = connector.insert_feedback(feedback)
289
+
290
+ logger.info("🔌 SNOWFLAKE SAVE: Disconnecting...")
291
+ connector.disconnect()
292
+
293
+ if success:
294
+ logger.info("✅ SNOWFLAKE SAVE: Successfully saved feedback")
295
+ else:
296
+ logger.error("❌ SNOWFLAKE SAVE: Failed to save feedback")
297
+
298
+ logger.info("=" * 80)
299
+ return success
300
+ except Exception as e:
301
+ logger.error(f"❌ SNOWFLAKE SAVE ERROR: {type(e).__name__}")
302
+ logger.error(f" - Error: {e}")
303
+ logger.info("=" * 80)
304
+ return False
305
+
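The connector is configured entirely through environment variables; a minimal end-to-end sketch (the credential values are placeholders, and the feedback dict is the minimal set of required keys):

import os
from src.reporting.feedback_schema import create_feedback_from_dict
from src.reporting.snowflake_connector import save_to_snowflake

os.environ.setdefault("SNOWFLAKE_USER", "my_user")          # placeholder
os.environ.setdefault("SNOWFLAKE_PASSWORD", "my_password")  # placeholder
os.environ.setdefault("SNOWFLAKE_ACCOUNT", "my_account")    # placeholder
os.environ.setdefault("SNOWFLAKE_WAREHOUSE", "my_wh")       # placeholder

feedback = create_feedback_from_dict({
    "score": 5, "is_feedback_about_last_retrieval": False,
    "conversation_id": "conv-123", "timestamp": 1700000000,
    "message_count": 2, "has_retrievals": False, "retrieval_count": 0,
})
ok = save_to_snowflake(feedback)  # returns False (and logs why) instead of raising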
src/retrieval/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """Document retrieval and filtering utilities."""
2
+
3
+ from .filter import create_filter, FilterBuilder
4
+ from .context import ContextRetriever, get_context
5
+ from .hybrid import HybridRetriever, get_available_search_modes, get_search_mode_description
6
+
7
+ __all__ = [
8
+ "create_filter",
9
+ "FilterBuilder",
10
+ "ContextRetriever",
11
+ "get_context",
12
+ "HybridRetriever",
13
+ "get_available_search_modes",
14
+ "get_search_mode_description"
15
+ ]
src/retrieval/colbert_cache.py ADDED
@@ -0,0 +1,74 @@
1
+ """
2
+ ColBERT embeddings cache for test set documents.
3
+ Provides O(1) lookup for ColBERT embeddings during late interaction.
4
+ """
5
+
6
+ import json
7
+ import numpy as np
8
+ from pathlib import Path
9
+ from typing import Dict, Optional, Any
10
+
11
+
12
+ class ColBERTCache:
13
+ """Cache for ColBERT embeddings of test set documents."""
14
+
15
+ def __init__(self, cache_file: str = "test_set_colbert_cache.json"):
16
+ self.cache_file = Path("outputs/caches") / cache_file
17
+ self.embeddings_cache: Dict[str, np.ndarray] = {}
18
+ self._load_cache()
19
+
20
+ def _load_cache(self):
21
+ """Load embeddings from cache file."""
22
+ if not self.cache_file.exists():
23
+ print(f"⚠️ ColBERT cache not found: {self.cache_file}")
24
+ print("💡 Run 'python precalculate_test_set_colbert.py' to create cache")
25
+ return
26
+
27
+ print(f"📂 Loading ColBERT cache from {self.cache_file}...")
28
+
29
+ try:
30
+ with open(self.cache_file, 'r') as f:
31
+ cache_data = json.load(f)
32
+
33
+ # Reconstruct embeddings from compressed format
34
+ for doc_id, data in cache_data.items():
35
+ embedding_min = data['min']
36
+ embedding_max = data['max']
37
+ quantized_embedding = np.array(data['embedding'], dtype=np.uint8)
38
+
39
+ # Reconstruct original embedding
40
+ reconstructed = (quantized_embedding.astype(np.float32) / 255.0) * (embedding_max - embedding_min) + embedding_min
41
+ self.embeddings_cache[doc_id] = reconstructed.reshape(data['shape'])
42
+
43
+ print(f"✅ Loaded {len(self.embeddings_cache)} ColBERT embeddings from cache")
44
+
45
+ except Exception as e:
46
+ print(f"❌ Error loading ColBERT cache: {e}")
47
+ self.embeddings_cache = {}
48
+
49
+ def get_embedding(self, document_text: str) -> Optional[np.ndarray]:
50
+ """Get ColBERT embedding for a document (O(1) lookup)."""
51
+ return self.embeddings_cache.get(document_text)
52
+
53
+ def has_embedding(self, document_text: str) -> bool:
54
+ """Check if embedding exists for document."""
55
+ return document_text in self.embeddings_cache
56
+
57
+ def get_cache_stats(self) -> Dict[str, Any]:
58
+ """Get cache statistics."""
59
+ return {
60
+ 'total_embeddings': len(self.embeddings_cache),
61
+ 'cache_file': str(self.cache_file),
62
+ 'cache_exists': self.cache_file.exists()
63
+ }
64
+
65
+
66
+ # Global cache instance
67
+ _colbert_cache = None
68
+
69
+ def get_colbert_cache() -> ColBERTCache:
70
+ """Get global ColBERT cache instance."""
71
+ global _colbert_cache
72
+ if _colbert_cache is None:
73
+ _colbert_cache = ColBERTCache()
74
+ return _colbert_cache
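The cache reader above reverses a min-max uint8 quantization. A small round-trip sketch of that scheme; the writer side (precalculate_test_set_colbert.py) is assumed to quantize symmetrically, which is not shown in this commit:

import numpy as np

emb = np.random.randn(32, 128).astype(np.float32)  # illustrative token-level embedding
lo, hi = float(emb.min()), float(emb.max())

# Quantize: map [lo, hi] linearly onto [0, 255] (assumed writer-side step)
q = np.round((emb - lo) / (hi - lo) * 255.0).astype(np.uint8)

# Dequantize: exactly the reconstruction _load_cache performs
recon = (q.astype(np.float32) / 255.0) * (hi - lo) + lo
print(float(np.abs(recon - emb).max()))  # error bounded by roughly (hi - lo) / 255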
src/retrieval/context.py ADDED
@@ -0,0 +1,881 @@
1
+ """Context retrieval with reranking capabilities."""
2
+
3
+ import os
4
+ from typing import List, Optional, Tuple, Dict, Any
5
+ from langchain.schema import Document
6
+ from langchain_community.vectorstores import Qdrant
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from sentence_transformers import CrossEncoder
9
+ import numpy as np
10
+ import torch
11
+ from qdrant_client.http import models as rest
12
+ import traceback
13
+
14
+ from .filter import create_filter
15
+
16
+ class ContextRetriever:
17
+ """
18
+ Context retriever for hybrid search with optional filtering and reranking.
19
+ """
20
+
21
+ def __init__(self, vectorstore: Qdrant, config: dict = None):
22
+ """
23
+ Initialize the context retriever.
24
+
25
+ Args:
26
+ vectorstore: Qdrant vector store instance
27
+ config: Configuration dictionary
28
+ """
29
+ self.vectorstore = vectorstore
30
+ self.config = config or {}
31
+ self.reranker = None
32
+
33
+ # BM25 attributes
34
+ self.bm25_vectorizer = None
35
+ self.bm25_matrix = None
36
+ self.bm25_documents = None
37
+
38
+ # Initialize reranker if available
39
+ # Try to get reranker model from different config paths
40
+ self.reranker_model_name = (
41
+ self.config.get('retrieval', {}).get('reranker_model') or
42
+ self.config.get('ranker', {}).get('model') or
43
+ self.config.get('reranker_model') or
44
+ 'BAAI/bge-reranker-v2-m3'
45
+ )
46
+ self.reranker_type = self._detect_reranker_type(self.reranker_model_name)
47
+
48
+ try:
49
+ if self.reranker_type == 'colbert':
50
+ from colbert.infra import Run, ColBERTConfig
51
+ from colbert.modeling.checkpoint import Checkpoint
52
+ # ColBERT uses late interaction - different implementation needed
53
+ print(f"✅ RERANKER: ColBERT model detected ({self.reranker_model_name})")
54
+ print(f"🔍 INTERACTION TYPE: Late interaction (token-level embeddings)")
55
+
56
+ # Create ColBERT config for CPU mode
57
+ colbert_config = ColBERTConfig(
58
+ doc_maxlen=300,
59
+ query_maxlen=32,
60
+ nbits=2,
61
+ kmeans_niters=4,
62
+ root="./colbert_data"
63
+ )
64
+
65
+ # Load checkpoint (e.g. "colbert-ir/colbertv2.0")
66
+ self.colbert_checkpoint = Checkpoint(self.reranker_model_name, colbert_config=colbert_config)
67
+ self.colbert_model = self.colbert_checkpoint.model
68
+ self.colbert_tokenizer = self.colbert_checkpoint.raw_tokenizer
69
+ self.reranker = self._colbert_rerank # attach wrapper function
70
+ print(f"✅ COLBERT: Model and tokenizer loaded successfully")
71
+
72
+ else:
73
+ # Standard CrossEncoder for BGE and other models
74
+ from sentence_transformers import CrossEncoder
75
+ self.reranker = CrossEncoder(self.reranker_model_name)
76
+ print(f"✅ RERANKER: Initialized {self.reranker_model_name}")
77
+ print(f"🔍 INTERACTION TYPE: Cross-encoder (single relevance score)")
78
+ except Exception as e:
79
+ print(f"⚠️ Reranker initialization failed: {e}")
80
+ self.reranker = None
81
+
82
+ def _detect_reranker_type(self, model_name: str) -> str:
83
+ """
84
+ Detect the type of reranker based on model name.
85
+
86
+ Args:
87
+ model_name: Name of the reranker model
88
+
89
+ Returns:
90
+ 'colbert' for ColBERT models, 'crossencoder' for others
91
+ """
92
+ model_name_lower = model_name.lower()
93
+
94
+ # ColBERT model patterns
95
+ colbert_patterns = [
96
+ 'colbert',
97
+ 'colbert-ir',
98
+ 'colbertv2',
99
+ 'colbert-v2'
100
+ ]
101
+
102
+ for pattern in colbert_patterns:
103
+ if pattern in model_name_lower:
104
+ return 'colbert'
105
+
106
+ # Default to cross-encoder for BGE and other models
107
+ return 'crossencoder'
108
+
109
+ def _similarity_search_with_colbert_embeddings(self, query: str, k: int = 5, **kwargs) -> List[Tuple[Document, float]]:
110
+ """
111
+ Perform similarity search and fetch ColBERT embeddings for documents.
112
+
113
+ Args:
114
+ query: Search query
115
+ k: Number of documents to retrieve
116
+ **kwargs: Additional search parameters (filter, etc.)
117
+
118
+ Returns:
119
+ List of (Document, score) tuples with ColBERT embeddings in metadata
120
+ """
121
+ try:
122
+ print(f"🔍 COLBERT RETRIEVAL: Fetching documents with ColBERT embeddings")
123
+
124
+ # Use the vectorstore's similarity_search_with_score method instead of direct client
125
+ # This ensures proper filter handling
126
+ if 'filter' in kwargs and kwargs['filter']:
127
+ # Use the vectorstore method with filter
128
+ result = self.vectorstore.similarity_search_with_score(
129
+ query,
130
+ k=k,
131
+ filter=kwargs['filter']
132
+ )
133
+ else:
134
+ # Use the vectorstore method without filter
135
+ result = self.vectorstore.similarity_search_with_score(query, k=k)
136
+
137
+ # Convert to the format we need
138
+ if isinstance(result, tuple) and len(result) == 2:
139
+ documents, scores = result
140
+ elif isinstance(result, list):
141
+ documents = []
142
+ scores = []
143
+ for item in result:
144
+ if isinstance(item, tuple) and len(item) == 2:
145
+ doc, score = item
146
+ documents.append(doc)
147
+ scores.append(score)
148
+ else:
149
+ documents.append(item)
150
+ scores.append(0.0)
151
+ else:
152
+ documents = []
153
+ scores = []
154
+
155
+ # Now we need to fetch the ColBERT embeddings for these documents
156
+ # We'll use the Qdrant client directly for this part since we need specific payload fields
157
+ from qdrant_client.http import models as rest
158
+
159
+ collection_name = self.vectorstore.collection_name
160
+
161
+ # Get document IDs from the retrieved documents
162
+ doc_ids = []
163
+ for doc in documents:
164
+ # Extract ID from document metadata or use page_content hash as fallback
165
+ doc_id = doc.metadata.get('id') or doc.metadata.get('_id')
166
+ if not doc_id:
167
+ # Use a hash of the content as ID
168
+ import hashlib
169
+ doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
170
+ doc_ids.append(doc_id)
171
+
172
+ # Fetch documents with ColBERT embeddings from Qdrant
173
+ search_result = self.vectorstore.client.retrieve(
174
+ collection_name=collection_name,
175
+ ids=doc_ids,
176
+ with_payload=True,
177
+ with_vectors=False
178
+ )
179
+
180
+ # Convert results to Document objects with ColBERT embeddings
181
+ enhanced_documents = []
182
+ enhanced_scores = []
183
+
184
+ # Create a mapping from doc_id to original score
185
+ doc_id_to_score = {}
186
+ for i, doc in enumerate(documents):
187
+ doc_id = doc.metadata.get('id') or doc.metadata.get('_id')
188
+ if not doc_id:
189
+ import hashlib
190
+ doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()
191
+ doc_id_to_score[doc_id] = scores[i]
192
+
193
+ for point in search_result:
194
+ # Extract payload
195
+ payload = point.payload
196
+
197
+ # Get the original score for this document
198
+ doc_id = str(point.id)
199
+ original_score = doc_id_to_score.get(doc_id, 0.0)
200
+
201
+ # Create Document object with ColBERT embeddings
202
+ doc = Document(
203
+ page_content=payload.get('page_content', ''),
204
+ metadata={
205
+ **payload.get('metadata', {}),
206
+ 'colbert_embedding': payload.get('colbert_embedding'),
207
+ 'colbert_model': payload.get('colbert_model'),
208
+ 'colbert_calculated_at': payload.get('colbert_calculated_at')
209
+ }
210
+ )
211
+
212
+ enhanced_documents.append(doc)
213
+ enhanced_scores.append(original_score)
214
+
215
+ print(f"✅ COLBERT RETRIEVAL: Retrieved {len(enhanced_documents)} documents with ColBERT embeddings")
216
+
217
+ return list(zip(enhanced_documents, enhanced_scores))
218
+
219
+ except Exception as e:
220
+ print(f"❌ COLBERT RETRIEVAL ERROR: {e}")
221
+ print(f"❌ Falling back to regular similarity search")
222
+
223
+ # Fallback to regular search - handle filter parameter correctly
224
+ if 'filter' in kwargs and kwargs['filter']:
225
+ return self.vectorstore.similarity_search_with_score(query, k=k, filter=kwargs['filter'])
226
+ else:
227
+ return self.vectorstore.similarity_search_with_score(query, k=k)
228
+
229
+ def retrieve_context(
230
+ self,
231
+ query: str,
232
+ k: int = 5,
233
+ reports: Optional[List[str]] = None,
234
+ sources: Optional[List[str]] = None,
235
+ subtype: Optional[List[str]] = None,
236
+ year: Optional[List[str]] = None,
237
+ district: Optional[List[str]] = None,
238
+ filenames: Optional[List[str]] = None,
239
+ use_reranking: bool = False,
240
+ qdrant_filter: Optional[rest.Filter] = None
241
+ ) -> List[Document]:
242
+ """
243
+ Retrieve context documents using hybrid search with optional filtering and reranking.
244
+
245
+ Args:
246
+ query: User query
247
+ k: Number of documents to retrieve
248
+ reports: List of report names to filter by
249
+ sources: List of sources to filter by
250
+ subtype: List of subtypes to filter by
251
+ year: List of years to filter by
+ district: List of districts to filter by
+ filenames: List of filenames to filter by
252
+ use_reranking: Whether to apply reranking
253
+ qdrant_filter: Pre-built Qdrant filter to use
254
+
255
+ Returns:
256
+ List of retrieved documents
257
+ """
258
+ try:
259
+ # Determine how many documents to retrieve
260
+ retrieve_k = k  # could be raised (e.g. k * 3) to over-retrieve before reranking
261
+
262
+ # Build search kwargs
263
+ search_kwargs = {}
264
+
265
+ # Use qdrant_filter if provided (this takes precedence)
266
+ if qdrant_filter:
267
+ search_kwargs = {"filter": qdrant_filter}
268
+ print(f"✅ FILTERS APPLIED: Using inferred Qdrant filter")
269
+ else:
270
+ # Build filter from individual parameters
271
+ filter_obj = create_filter(
272
+ reports=reports,
273
+ sources=sources,
274
+ subtype=subtype,
275
+ year=year,
276
+ district=district,
277
+ filenames=filenames
278
+ )
279
+
280
+ if filter_obj:
281
+ search_kwargs = {"filter": filter_obj}
282
+ print(f"✅ FILTERS APPLIED: Using built filter")
283
+ else:
284
+ search_kwargs = {}
285
+ print(f"⚠️ NO FILTERS APPLIED: All documents will be searched")
286
+
287
+ # Perform vector search
288
+ try:
289
+ # Check if we need ColBERT embeddings for reranking
290
+ if use_reranking and self.reranker_type == 'colbert':
291
+ result = self._similarity_search_with_colbert_embeddings(
292
+ query,
293
+ k=retrieve_k,
294
+ **search_kwargs
295
+ )
296
+ else:
297
+ result = self.vectorstore.similarity_search_with_score(
298
+ query,
299
+ k=retrieve_k,
300
+ **search_kwargs
301
+ )
302
+
303
+ # Handle different return formats
304
+ if isinstance(result, tuple) and len(result) == 2:
305
+ documents, scores = result
306
+ elif isinstance(result, list) and len(result) > 0:
307
+ # Handle case where result is a list of (Document, score) tuples
308
+ documents = []
309
+ scores = []
310
+ for item in result:
311
+ if isinstance(item, tuple) and len(item) == 2:
312
+ doc, score = item
313
+ documents.append(doc)
314
+ scores.append(score)
315
+ else:
316
+ # Handle case where item is just a Document
317
+ documents.append(item)
318
+ scores.append(0.0) # Default score
319
+ else:
320
+ documents = []
321
+ scores = []
322
+
323
+ print(f"✅ RETRIEVAL SUCCESS: Retrieved {len(documents)} documents (requested: {retrieve_k})")
324
+
325
+ # If we got fewer documents than requested, try without filters
326
+ if len(documents) < retrieve_k and search_kwargs.get('filter'):
327
+ print(f"⚠️ RETRIEVAL: Got {len(documents)} docs with filters, trying without filters...")
328
+ try:
329
+ result_no_filter = self.vectorstore.similarity_search_with_score(
330
+ query,
331
+ k=retrieve_k
332
+ )
333
+
334
+ if isinstance(result_no_filter, tuple) and len(result_no_filter) == 2:
335
+ documents_no_filter, scores_no_filter = result_no_filter
336
+ elif isinstance(result_no_filter, list):
337
+ documents_no_filter = []
338
+ scores_no_filter = []
339
+ for item in result_no_filter:
340
+ if isinstance(item, tuple) and len(item) == 2:
341
+ doc, score = item
342
+ documents_no_filter.append(doc)
343
+ scores_no_filter.append(score)
344
+ else:
345
+ documents_no_filter.append(item)
346
+ scores_no_filter.append(0.0)
347
+ else:
348
+ documents_no_filter = []
349
+ scores_no_filter = []
350
+
351
+ if len(documents_no_filter) > len(documents):
352
+ print(f"✅ RETRIEVAL: Got {len(documents_no_filter)} docs without filters")
353
+ documents = documents_no_filter
354
+ scores = scores_no_filter
355
+ except Exception as e:
356
+ print(f"⚠️ RETRIEVAL: Fallback search failed: {e}")
357
+
358
+ except Exception as e:
359
+ print(f"❌ RETRIEVAL ERROR: {str(e)}")
360
+ return []
361
+
362
+ # Apply reranking if enabled
363
+ reranking_applied = False
364
+ if use_reranking and len(documents) > 1:
365
+ print(f"🔄 RERANKING: Applying {self.reranker_model_name} to {len(documents)} documents...")
366
+ try:
367
+ original_docs = documents.copy()
368
+ original_scores = scores.copy()
369
+
370
+ # Apply reranking
371
+ # print(f"🔍 ORIGINAL DOCS: {documents[0]}")
372
+ reranked_docs = self._apply_reranking(query, documents, scores)
373
+ # print(f"🔍 RERANKED DOCS: {reranked_docs[0]}")
374
+ reranking_applied = len(reranked_docs) > 0
375
+
376
+ if reranking_applied:
377
+ print(f"✅ RERANKING APPLIED: {self.reranker_model_name}")
378
+ documents = reranked_docs
379
+ # Update scores to reflect reranking
380
+ # scores = [0.0] * len(documents) # Reranked scores are not directly comparable
381
+ else:
+ print(f"⚠️ RERANKING FAILED: Using original order")
+ documents = original_docs
+ scores = original_scores
386
+
387
+ except Exception as e:
388
+ print(f"❌ RERANKING ERROR: {str(e)}")
389
+ print(f"⚠️ RERANKING FAILED: Using original order")
390
+ reranking_applied = False
391
+ elif use_reranking and len(documents) <= 1:
+ print(f"ℹ️ RERANKING: Skipped (only {len(documents)} document(s) retrieved)")
+ else:
+ print(f"ℹ️ RERANKING: Skipped (disabled)")
402
+
403
+ # Limit to requested number of documents
404
+ documents = documents[:k]
405
+ scores = scores[:k] if scores else [0.0] * len(documents)
406
+
407
+ # Add metadata to documents
408
+ for i, (doc, score) in enumerate(zip(documents, scores)):
409
+ if hasattr(doc, 'metadata'):
410
+ doc.metadata.update({
411
+ 'reranking_applied': reranking_applied,
412
+ 'reranker_model': self.reranker_model_name if reranking_applied else None,
413
+ 'original_rank': i + 1,
414
+ 'final_rank': i + 1,
415
+ 'original_score': float(score) if score is not None else 0.0
416
+ })
417
+
418
+ return documents
419
+
420
+ except Exception as e:
421
+ print(f"❌ CONTEXT RETRIEVAL ERROR: {str(e)}")
422
+ return []
423
+
424
+ def _apply_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
425
+ """
426
+ Apply reranking to documents using the appropriate reranker.
427
+
428
+ Args:
429
+ query: User query
430
+ documents: List of documents to rerank
431
+ scores: Original scores
432
+
433
+ Returns:
434
+ Reranked list of documents
435
+ """
436
+ if not self.reranker or len(documents) == 0:
437
+ return documents
438
+
439
+ try:
440
+ print(f"🔍 RERANKING METHOD: Starting reranking with {len(documents)} documents")
441
+ print(f"🔍 RERANKING TYPE: {self.reranker_type.upper()}")
442
+
443
+ if self.reranker_type == 'colbert':
444
+ return self._apply_colbert_reranking(query, documents, scores)
445
+ else:
446
+ return self._apply_crossencoder_reranking(query, documents, scores)
447
+
448
+ except Exception as e:
449
+ print(f"❌ RERANKING ERROR: {str(e)}")
450
+ return documents
451
+
452
+ def _apply_crossencoder_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
453
+ """
454
+ Apply reranking using CrossEncoder (BGE and other models).
455
+
456
+ Args:
457
+ query: User query
458
+ documents: List of documents to rerank
459
+ scores: Original scores
460
+
461
+ Returns:
462
+ Reranked list of documents
463
+ """
464
+ # Prepare pairs for reranking
465
+ pairs = []
466
+ for doc in documents:
467
+ pairs.append([query, doc.page_content])
468
+
469
+ print(f"🔍 CROSS-ENCODER: Prepared {len(pairs)} pairs for reranking")
470
+
471
+ # Get reranking scores using the correct CrossEncoder API
472
+ rerank_scores = self.reranker.predict(pairs)
473
+
474
+ # Handle single score case
475
+ if not isinstance(rerank_scores, (list, np.ndarray)):
476
+ rerank_scores = [rerank_scores]
477
+
478
+ # Ensure we have the right number of scores
479
+ if len(rerank_scores) != len(documents):
480
+ print(f"⚠️ RERANKING WARNING: Expected {len(documents)} scores, got {len(rerank_scores)}")
481
+ return documents
482
+
483
+ print(f"🔍 CROSS-ENCODER: Got {len(rerank_scores)} rerank scores")
484
+ print(f"🔍 CROSS-ENCODER SCORES: {rerank_scores[:5]}...") # Show first 5 scores
485
+
486
+ # Combine documents with their rerank scores
487
+ doc_scores = list(zip(documents, rerank_scores))
488
+
489
+ # Sort by rerank score (descending)
490
+ doc_scores.sort(key=lambda x: x[1], reverse=True)
491
+
492
+ # Extract reranked documents and store scores in metadata
493
+ reranked_docs = []
494
+ for i, (doc, rerank_score) in enumerate(doc_scores):
495
+ # Find original index for original score
496
+ original_idx = documents.index(doc)
497
+ original_score = scores[original_idx] if original_idx < len(scores) else 0.0
498
+
499
+ # Create new document with reranking metadata
500
+ new_doc = Document(
501
+ page_content=doc.page_content,
502
+ metadata={
503
+ **doc.metadata,
504
+ 'reranking_applied': True,
505
+ 'reranker_model': self.reranker_model_name,
506
+ 'reranker_type': self.reranker_type,
507
+ 'original_rank': original_idx + 1,
508
+ 'final_rank': i + 1,
509
+ 'original_score': float(original_score),
510
+ 'reranked_score': float(rerank_score)
511
+ }
512
+ )
513
+ reranked_docs.append(new_doc)
514
+
515
+ print(f"✅ CROSS-ENCODER: Reranked {len(reranked_docs)} documents")
516
+
517
+ return reranked_docs
518
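For context, a minimal sketch of the sentence-transformers CrossEncoder contract relied on above. The model name matches the default reranker used elsewhere in this diff; weights are downloaded on first use:

from sentence_transformers import CrossEncoder

reranker = CrossEncoder("BAAI/bge-reranker-v2-m3")
pairs = [
    ["what was audited in 2022", "The 2022 audit covered district budgets."],
    ["what was audited in 2022", "Procurement guidelines for staff travel."],
]
scores = reranker.predict(pairs)  # one relevance score per (query, passage) pair
ranked = sorted(zip(pairs, scores), key=lambda p: p[1], reverse=True)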
+
519
+ def _apply_colbert_reranking(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
520
+ """
521
+ Apply reranking using ColBERT late interaction.
522
+
523
+ Args:
524
+ query: User query
525
+ documents: List of documents to rerank
526
+ scores: Original scores
527
+
528
+ Returns:
529
+ Reranked list of documents
530
+ """
531
+ # Use the actual ColBERT reranking implementation
532
+ return self._colbert_rerank(query, documents, scores)
533
+
534
+ def _colbert_rerank(self, query: str, documents: List[Document], scores: List[float]) -> List[Document]:
535
+ """
536
+ ColBERT reranking using late interaction with pre-calculated embeddings support.
537
+
538
+ Args:
539
+ query: User query
540
+ documents: List of documents to rerank
541
+ scores: Original scores
542
+
543
+ Returns:
544
+ Reranked list of documents
545
+ """
546
+ try:
547
+ print(f"🔍 COLBERT: Starting late interaction reranking with {len(documents)} documents")
548
+
549
+ # Check if documents have pre-calculated ColBERT embeddings
550
+ pre_calculated_embeddings = []
551
+ documents_without_embeddings = []
552
+ documents_without_indices = []
553
+
554
+ for i, doc in enumerate(documents):
555
+ if (hasattr(doc, 'metadata') and
556
+ 'colbert_embedding' in doc.metadata and
557
+ doc.metadata['colbert_embedding'] is not None):
558
+ # Use pre-calculated embedding
559
+ colbert_embedding = doc.metadata['colbert_embedding']
560
+ if isinstance(colbert_embedding, list):
561
+ colbert_embedding = torch.tensor(colbert_embedding)
562
+ pre_calculated_embeddings.append(colbert_embedding)
563
+ else:
564
+ # Need to calculate embedding
565
+ documents_without_embeddings.append(doc)
566
+ documents_without_indices.append(i)
567
+
568
+ # Calculate query embedding
569
+ query_embeddings = self.colbert_checkpoint.queryFromText([query])
570
+
571
+ # Calculate embeddings for documents without pre-calculated ones
572
+ if documents_without_embeddings:
573
+ print(f"🔄 COLBERT: Calculating embeddings for {len(documents_without_embeddings)} documents without pre-calculated embeddings")
574
+ doc_texts = [doc.page_content for doc in documents_without_embeddings]
575
+ doc_embeddings = self.colbert_checkpoint.docFromText(doc_texts)
576
+
577
+ # Insert calculated embeddings into the right positions
578
+ for i, embedding in enumerate(doc_embeddings):
579
+ idx = documents_without_indices[i]
580
+ pre_calculated_embeddings.insert(idx, embedding)
581
+ else:
582
+ print(f"✅ COLBERT: Using pre-calculated embeddings for all {len(documents)} documents")
583
+
584
+ # Calculate late interaction scores
585
+ # ColBERT uses MaxSim: for each query token, find max similarity with document tokens
586
+ colbert_scores = []
587
+ for i, doc_embedding in enumerate(pre_calculated_embeddings):
588
+ # Calculate similarity matrix between query and document i
589
+ sim_matrix = torch.matmul(query_embeddings[0], doc_embedding.transpose(-1, -2))
590
+
591
+ # MaxSim: for each query token, take max similarity with document
592
+ max_sim_per_query_token = torch.max(sim_matrix, dim=-1)[0]
593
+
594
+ # Sum over query tokens to get final score
595
+ final_score = torch.sum(max_sim_per_query_token).item()
596
+ colbert_scores.append(final_score)
597
+
598
+ # Sort documents by ColBERT scores
599
+ doc_scores = list(zip(documents, colbert_scores))
600
+ doc_scores.sort(key=lambda x: x[1], reverse=True)
601
+
602
+ # Create reranked documents with metadata
603
+ reranked_docs = []
604
+ for i, (doc, colbert_score) in enumerate(doc_scores):
605
+ original_idx = documents.index(doc)
606
+ original_score = scores[original_idx] if original_idx < len(scores) else 0.0
607
+
608
+ new_doc = Document(
609
+ page_content=doc.page_content,
610
+ metadata={
611
+ **doc.metadata,
612
+ 'reranking_applied': True,
613
+ 'reranker_model': self.reranker_model_name,
614
+ 'reranker_type': self.reranker_type,
615
+ 'original_rank': original_idx + 1,
616
+ 'final_rank': i + 1,
617
+ 'original_score': float(original_score),
618
+ 'reranked_score': float(colbert_score),
619
+ 'colbert_score': float(colbert_score),
620
+ 'colbert_embedding_pre_calculated': 'colbert_embedding' in doc.metadata
621
+ }
622
+ )
623
+ reranked_docs.append(new_doc)
624
+
625
+ print(f"✅ COLBERT: Reranked {len(reranked_docs)} documents using late interaction")
626
+ print(f"🔍 COLBERT SCORES: {[f'{score:.4f}' for score in colbert_scores[:5]]}...")
627
+
628
+ return reranked_docs
629
+
630
+ except Exception as e:
631
+ print(f"❌ COLBERT RERANKING ERROR: {str(e)}")
632
+ print(f"❌ COLBERT TRACEBACK: {traceback.format_exc()}")
633
+ # Fallback to original order - return documents as-is
634
+ return documents
635
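For intuition, a toy illustration of the MaxSim late-interaction scoring implemented above; random tensors stand in for real ColBERT token embeddings:

import torch

query_emb = torch.randn(3, 8)            # [query_tokens, dim]
doc_emb = torch.randn(5, 8)              # [doc_tokens, dim]
sim_matrix = query_emb @ doc_emb.T       # [3, 5] token-to-token similarities
max_per_query_token = sim_matrix.max(dim=-1).values  # best doc token per query token
score = max_per_query_token.sum().item()             # higher = more relevant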
+
636
+ def retrieve_with_scores(self, query: str, vectorstore=None, k: int = 5, reports: List[str] = None,
637
+ sources: List[str] = None, subtype: List[str] = None,
638
+ year: List[str] = None, use_reranking: bool = False,
639
+ qdrant_filter: Optional[rest.Filter] = None) -> Tuple[List[Document], List[float]]:
640
+ """
641
+ Retrieve context documents with scores via vector search (per the configured search strategy), with optional reranking.
642
+
643
+ Args:
644
+ query: User query
645
+ vectorstore: Optional vectorstore instance (for compatibility)
646
+ k: Number of documents to retrieve
647
+ reports: List of report names to filter by
648
+ sources: List of sources to filter by
649
+ subtype: List of document subtypes to filter by
650
+ year: List of years to filter by
651
+ use_reranking: Whether to apply reranking
652
+ qdrant_filter: Pre-built Qdrant filter
653
+
654
+ Returns:
655
+ Tuple of (documents, scores)
656
+ """
657
+ try:
658
+ # Use the provided vectorstore if available, otherwise use the instance one
659
+ if vectorstore:
660
+ self.vectorstore = vectorstore
661
+
662
+ # Determine search strategy
663
+ search_strategy = self.config.get('retrieval', {}).get('search_strategy', 'vector_only')
664
+
665
+ if search_strategy == 'vector_only':
666
+ # Vector search only
667
+ print(f"🔄 VECTOR SEARCH: Retrieving {k} documents...")
668
+
669
+ if qdrant_filter:
670
+ print(f"✅ QDRANT FILTER APPLIED: Using inferred Qdrant filter")
671
+ # Pass the pre-built Qdrant filter straight through to the vector store
672
+ results = self.vectorstore.similarity_search_with_score(
673
+ query,
674
+ k=k,
675
+ filter=qdrant_filter
676
+ )
677
+ else:
678
+ # Build filter from individual parameters
679
+ filter_conditions = self._build_filter_conditions(reports, sources, subtype, year)
680
+ if filter_conditions:
681
+ print(f"✅ FILTER APPLIED: {filter_conditions}")
682
+ results = self.vectorstore.similarity_search_with_score(
683
+ query,
684
+ k=k,
685
+ filter=filter_conditions
686
+ )
687
+ else:
688
+ print(f"ℹ️ NO FILTERS APPLIED: All documents will be searched")
689
+ results = self.vectorstore.similarity_search_with_score(query, k=k)
690
+
691
+ print(f"🔍 SEARCH DEBUG: Raw result type: {type(results)}")
692
+ print(f"🔍 SEARCH DEBUG: Raw result length: {len(results)}")
693
+
694
+ # Handle different result formats
695
+ if results and isinstance(results[0], tuple):
696
+ documents = [doc for doc, score in results]
697
+ scores = [score for doc, score in results]
698
+ print(f"🔍 SEARCH DEBUG: After unpacking - documents: {len(documents)}, scores: {len(scores)}")
699
+ else:
700
+ documents = results
701
+ scores = [0.0] * len(documents)
702
+ print(f"🔍 SEARCH DEBUG: No scores available, using default")
703
+
704
+ print(f"🔧 CONVERTING: Converting {len(documents)} documents")
705
+
706
+ # Convert to Document objects and store original scores
707
+ final_documents = []
708
+ for i, (doc, score) in enumerate(zip(documents, scores)):
709
+ if hasattr(doc, 'page_content'):
710
+ new_doc = Document(
711
+ page_content=doc.page_content,
712
+ metadata=doc.metadata.copy()
713
+ )
714
+ # Store original score in metadata
715
+ new_doc.metadata['original_score'] = float(score) if score is not None else 0.0
716
+ final_documents.append(new_doc)
717
+ else:
718
+ print(f"⚠️ WARNING: Document {i} has no page_content")
719
+
720
+ print(f"✅ RETRIEVAL SUCCESS: Retrieved {len(final_documents)} documents")
721
+
722
+ # Apply reranking if enabled
723
+ if use_reranking and len(final_documents) > 1:
724
+ print(f"🔄 RERANKING: Applying {self.reranker_model} to {len(final_documents)} documents...")
725
+ final_documents = self._apply_reranking(query, final_documents, scores)
726
+ print(f"✅ RERANKING APPLIED: {self.reranker_model}")
727
+ else:
728
+ print(f"ℹ️ RERANKING: Skipped (disabled or no documents)")
729
+
730
+ return final_documents, scores
731
+
732
+ else:
733
+ print(f"❌ UNSUPPORTED STRATEGY: {search_strategy}")
734
+ return [], []
735
+
736
+ except Exception as e:
737
+ print(f"❌ RETRIEVAL ERROR: {e}")
738
+ print(f"❌ RETRIEVAL TRACEBACK: {traceback.format_exc()}")
739
+ return [], []
740
+
741
+ def _build_filter_conditions(self, reports: List[str] = None, sources: List[str] = None,
742
+ subtype: List[str] = None, year: List[str] = None) -> Optional[rest.Filter]:
743
+ """
744
+ Build Qdrant filter conditions from individual parameters.
745
+
746
+ Args:
747
+ reports: List of report names
748
+ sources: List of sources
749
+ subtype: Document subtype
750
+ year: List of years
751
+
752
+ Returns:
753
+ Qdrant filter or None
754
+ """
755
+ conditions = []
756
+
757
+ if reports:
758
+ conditions.append(rest.FieldCondition(
759
+ key="metadata.filename",
760
+ match=rest.MatchAny(any=reports)
761
+ ))
762
+
763
+ if sources:
764
+ conditions.append(rest.FieldCondition(
765
+ key="metadata.source",
766
+ match=rest.MatchAny(any=sources)
767
+ ))
768
+
769
+ if subtype:
770
+ conditions.append(rest.FieldCondition(
771
+ key="metadata.subtype",
772
+ match=rest.MatchAny(any=subtype)
773
+ ))
774
+
775
+ if year:
776
+ conditions.append(rest.FieldCondition(
777
+ key="metadata.year",
778
+ match=rest.MatchAny(any=year)
779
+ ))
780
+
781
+ if conditions:
782
+ return rest.Filter(must=conditions)
783
+
784
+ return None
785
+
786
+ def get_context(
787
+ query: str,
788
+ vectorstore: Qdrant,
789
+ k: int = 5,
790
+ reports: Optional[List[str]] = None,
791
+ sources: Optional[List[str]] = None,
792
+ subtype: Optional[str] = None,
793
+ year: Optional[str] = None,
794
+ use_reranking: bool = False,
795
+ qdrant_filter: Optional[rest.Filter] = None
796
+ ) -> List[Document]:
797
+ """
798
+ Convenience function to get context documents.
799
+
800
+ Args:
801
+ query: User query
802
+ vectorstore: Qdrant vector store instance
803
+ k: Number of documents to retrieve
804
+ reports: Optional list of report names to filter by
805
+ sources: Optional list of source categories to filter by
806
+ subtype: Optional subtype to filter by
807
+ year: Optional year to filter by
808
+ use_reranking: Whether to apply reranking
809
+ qdrant_filter: Optional pre-built Qdrant filter
810
+
811
+ Returns:
812
+ List of retrieved documents
813
+ """
814
+ retriever = ContextRetriever(vectorstore)
815
+ return retriever.retrieve_context(
816
+ query=query,
817
+ k=k,
818
+ reports=reports,
819
+ sources=sources,
820
+ subtype=subtype,
821
+ year=year,
822
+ use_reranking=use_reranking,
823
+ qdrant_filter=qdrant_filter
824
+ )
825
+
826
+
827
+ def format_context_for_llm(documents: List[Document]) -> str:
828
+ """
829
+ Format retrieved documents for LLM input.
830
+
831
+ Args:
832
+ documents: List of Document objects
833
+
834
+ Returns:
835
+ Formatted string for LLM
836
+ """
837
+ if not documents:
838
+ return ""
839
+
840
+ formatted_parts = []
841
+ for i, doc in enumerate(documents, 1):
842
+ content = doc.page_content.strip()
843
+ source = doc.metadata.get('filename', 'Unknown')
844
+
845
+ formatted_parts.append(f"Document {i} (Source: {source}):\n{content}")
846
+
847
+ return "\n\n".join(formatted_parts)
848
+
849
+
850
+ def get_context_metadata(documents: List[Document]) -> Dict[str, Any]:
851
+ """
852
+ Extract metadata summary from retrieved documents.
853
+
854
+ Args:
855
+ documents: List of Document objects
856
+
857
+ Returns:
858
+ Dictionary with metadata summary
859
+ """
860
+ if not documents:
861
+ return {}
862
+
863
+ sources = set()
864
+ years = set()
865
+ doc_types = set()
866
+
867
+ for doc in documents:
868
+ metadata = doc.metadata
869
+ if 'filename' in metadata:
870
+ sources.add(metadata['filename'])
871
+ if 'year' in metadata:
872
+ years.add(metadata['year'])
873
+ if 'source' in metadata:
874
+ doc_types.add(metadata['source'])
875
+
876
+ return {
877
+ "num_documents": len(documents),
878
+ "sources": list(sources),
879
+ "years": list(years),
880
+ "document_types": list(doc_types)
881
+ }
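Tying the module-level helpers together, a short hypothetical usage sketch (vectorstore is an assumed, pre-built Qdrant store):

docs = get_context(
    query="How were administrative costs managed?",
    vectorstore=vectorstore,
    k=3,
)
print(format_context_for_llm(docs))   # "Document 1 (Source: ...):\n..." blocks
print(get_context_metadata(docs))     # e.g. {'num_documents': 3, 'sources': [...], ...}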
src/retrieval/filter.py ADDED
@@ -0,0 +1,975 @@
1
+ """Document filtering utilities for Qdrant vector store."""
2
+
3
+ from typing import List, Optional, Union, Dict, Tuple, Any
4
+ from qdrant_client.http import models as rest
5
+ import time
6
+
7
+
8
+ class FilterBuilder:
9
+ """Builder class for creating Qdrant filters."""
10
+
11
+ def __init__(self):
12
+ self.conditions = []
13
+
14
+ def add_source_filter(self, source: Union[str, List[str]]) -> 'FilterBuilder':
15
+ """Add source filter condition."""
16
+ if source:
17
+ if isinstance(source, list):
18
+ condition = rest.FieldCondition(
19
+ key="metadata.source",
20
+ match=rest.MatchAny(any=source)
21
+ )
22
+ print(f"🔧 FilterBuilder: Added source filter for {source}")
23
+ else:
24
+ condition = rest.FieldCondition(
25
+ key="metadata.source",
26
+ match=rest.MatchValue(value=source)
27
+ )
28
+ print(f"🔧 FilterBuilder: Added source filter for '{source}'")
29
+ self.conditions.append(condition)
30
+ return self
31
+
32
+ def add_filename_filter(self, filenames: List[str]) -> 'FilterBuilder':
33
+ """Add filename filter condition."""
34
+ if filenames:
35
+ condition = rest.FieldCondition(
36
+ key="metadata.filename",
37
+ match=rest.MatchAny(any=filenames)
38
+ )
39
+ self.conditions.append(condition)
40
+ print(f"🔧 FilterBuilder: Added filename filter for {filenames}")
41
+ return self
42
+
43
+ def add_year_filter(self, years: List[str]) -> 'FilterBuilder':
44
+ """Add year filter condition."""
45
+ if years:
46
+ condition = rest.FieldCondition(
47
+ key="metadata.year",
48
+ match=rest.MatchAny(any=years)
49
+ )
50
+ self.conditions.append(condition)
51
+ print(f"🔧 FilterBuilder: Added year filter for {years}")
52
+ return self
53
+
54
+ def add_district_filter(self, districts: List[str]) -> 'FilterBuilder':
55
+ """Add district filter condition."""
56
+ if districts:
57
+ condition = rest.FieldCondition(
58
+ key="metadata.district",
59
+ match=rest.MatchAny(any=districts)
60
+ )
61
+ self.conditions.append(condition)
62
+ print(f"🔧 FilterBuilder: Added district filter for {districts}")
63
+ return self
64
+
65
+ def add_custom_filter(self, key: str, value: Union[str, List[str]]) -> 'FilterBuilder':
66
+ """Add custom filter condition."""
67
+ if isinstance(value, list):
68
+ condition = rest.FieldCondition(
69
+ key=key,
70
+ match=rest.MatchAny(any=value)
71
+ )
72
+ else:
73
+ condition = rest.FieldCondition(
74
+ key=key,
75
+ match=rest.MatchValue(value=value)
76
+ )
77
+ self.conditions.append(condition)
78
+ return self
79
+
80
+ def build(self) -> rest.Filter:
81
+ """Build the final filter."""
82
+ if not self.conditions:
83
+ return None
84
+
85
+ return rest.Filter(must=self.conditions)
86
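For example, a sketch of chaining the builder to restrict retrieval to Local Government reports from 2022 or 2023 (values are placeholders):

flt = (
    FilterBuilder()
    .add_source_filter("Local Government")
    .add_year_filter(["2022", "2023"])
    .build()
)
# flt is a rest.Filter with two must-conditions; pass it to the store as
# vectorstore.similarity_search_with_score(query, k=5, filter=flt)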
+
87
+
88
+ def create_filter(
89
+ reports: List[str] = None,
90
+ sources: Union[str, List[str]] = None,
91
+ subtype: List[str] = None,
92
+ year: List[str] = None,
93
+ district: List[str] = None,
94
+ filenames: List[str] = None
95
+ ) -> rest.Filter:
96
+ """
97
+ Create a search filter for Qdrant (legacy function for compatibility).
98
+
99
+ Args:
100
+ reports: List of specific report filenames
101
+ sources: Source category
102
+ subtype: List of subtypes/filenames
103
+ year: List of years
104
+ district: List of districts
105
+ filenames: List of specific filenames (mutually exclusive with other filters)
106
+
107
+ Returns:
108
+ Qdrant Filter object
109
+
110
+ Note:
111
+ If filenames are provided, ONLY filename filtering is applied (mutually exclusive)
112
+ """
113
+ builder = FilterBuilder()
114
+
115
+ # Check if filename filtering is requested (mutually exclusive)
116
+ # Both filenames and reports serve the same purpose (backward compatibility)
117
+ # Prefer filenames, fallback to reports for legacy support
118
+ target_filenames = filenames if filenames else reports
119
+
120
+ if target_filenames and len(target_filenames) > 0:
121
+ # ONLY apply filename filter, ignore all other filters
122
+ print(f"🔍 FILTER APPLIED: Filenames = {target_filenames} (mutually exclusive mode)")
123
+ builder.add_filename_filter(target_filenames)
124
+ else:
125
+ # Otherwise, filter by source and subtype
126
+ print(f"🔍 FILTER APPLIED: Sources = {sources}, Subtype = {subtype}, Year = {year}, District = {district}")
127
+ if sources:
128
+ print(f"✅ Adding source filter: metadata.source = '{sources}'")
129
+ builder.add_source_filter(sources)
130
+ if subtype:
131
+ print(f"✅ Adding subtype filter: metadata.filename IN {subtype}")
132
+ builder.add_filename_filter(subtype)
133
+ if year:
134
+ print(f"✅ Adding year filter: metadata.year IN {year}")
135
+ builder.add_year_filter(year)
136
+
137
+ if district:
138
+ print(f"✅ Adding district filter: metadata.district IN {district}")
139
+ builder.add_district_filter(district)
140
+
141
+ filter_obj = builder.build()
142
+
143
+ if filter_obj:
144
+ print(f"�� FINAL FILTER: {len(filter_obj.must)} condition(s) applied")
145
+ for i, condition in enumerate(filter_obj.must, 1):
146
+ print(f" Condition {i}: {condition.key} = {condition.match}")
147
+ else:
148
+ print("⚠️ NO FILTERS APPLIED: All documents will be searched")
149
+
150
+ return filter_obj
151
+
152
+
153
+ def create_advanced_filter(
154
+ must_conditions: List[dict] = None,
155
+ should_conditions: List[dict] = None,
156
+ must_not_conditions: List[dict] = None
157
+ ) -> rest.Filter:
158
+ """
159
+ Create advanced filter with multiple condition types.
160
+
161
+ Args:
162
+ must_conditions: Conditions that must match
163
+ should_conditions: Conditions that should match (OR logic)
164
+ must_not_conditions: Conditions that must not match
165
+
166
+ Returns:
167
+ Qdrant Filter object
168
+ """
169
+ filter_dict = {}
170
+
171
+ if must_conditions:
172
+ filter_dict["must"] = [
173
+ _dict_to_field_condition(cond) for cond in must_conditions
174
+ ]
175
+
176
+ if should_conditions:
177
+ filter_dict["should"] = [
178
+ _dict_to_field_condition(cond) for cond in should_conditions
179
+ ]
180
+
181
+ if must_not_conditions:
182
+ filter_dict["must_not"] = [
183
+ _dict_to_field_condition(cond) for cond in must_not_conditions
184
+ ]
185
+
186
+ if not filter_dict:
187
+ return None
188
+
189
+ return rest.Filter(**filter_dict)
190
+
191
+
192
+ def _dict_to_field_condition(condition_dict: dict) -> rest.FieldCondition:
193
+ """Convert dictionary to FieldCondition."""
194
+ key = condition_dict["key"]
195
+ value = condition_dict["value"]
196
+
197
+ if isinstance(value, list):
198
+ match = rest.MatchAny(any=value)
199
+ else:
200
+ match = rest.MatchValue(value=value)
201
+
202
+ return rest.FieldCondition(key=key, match=match)
203
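A hypothetical call showing the dict shape create_advanced_filter expects; list values become MatchAny, scalars become MatchValue:

flt = create_advanced_filter(
    must_conditions=[{"key": "metadata.source", "value": "Consolidated"}],
    should_conditions=[{"key": "metadata.year", "value": ["2022", "2023"]}],
    must_not_conditions=[{"key": "metadata.subtype", "value": "guidance"}],
)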
+
204
+
205
+ def validate_filter(filter_obj: rest.Filter) -> bool:
206
+ """
207
+ Validate that a filter object is properly constructed.
208
+
209
+ Args:
210
+ filter_obj: Qdrant Filter object
211
+
212
+ Returns:
213
+ True if valid, raises ValueError if invalid
214
+ """
215
+ if filter_obj is None:
216
+ return True
217
+
218
+ if not isinstance(filter_obj, rest.Filter):
219
+ raise ValueError("Filter must be a rest.Filter object")
220
+
221
+ # Check that at least one condition type is present
222
+ has_conditions = any([
223
+ hasattr(filter_obj, 'must') and filter_obj.must,
224
+ hasattr(filter_obj, 'should') and filter_obj.should,
225
+ hasattr(filter_obj, 'must_not') and filter_obj.must_not
226
+ ])
227
+
228
+ if not has_conditions:
229
+ raise ValueError("Filter must have at least one condition")
230
+
231
+ return True
232
+
233
+
234
+ def infer_filters_from_query(
235
+ query: str,
236
+ available_metadata: dict,
237
+ llm_client=None
238
+ ) -> Tuple[rest.Filter, Union[dict, None]]:
239
+ """
240
+ Automatically infer filters from a query using LLM analysis.
241
+
242
+ Args:
243
+ query: User query to analyze
244
+ available_metadata: Available metadata values in the vectorstore
245
+ llm_client: LLM client for analysis (optional)
246
+
247
+ Returns:
248
+ Tuple of (Qdrant Filter object with inferred conditions or None, filter summary dict or None)
249
+ """
250
+ print(f"�� AUTO-INFERRING FILTERS from query: '{query[:50]}...'")
251
+
252
+ # Check if LLM client is available
253
+ if not llm_client:
254
+ print(f"❌ LLM CLIENT MISSING: Cannot use LLM analysis, falling back to rule-based")
255
+ return _infer_filters_rule_based(query, available_metadata), None
256
+
257
+ # Extract available options
258
+ available_sources = available_metadata.get('sources', [])
259
+ available_years = available_metadata.get('years', [])
260
+ available_filenames = available_metadata.get('filenames', [])
261
+
262
+ print(f"📊 Available metadata: sources={len(available_sources)}, years={len(available_years)}, filenames={len(available_filenames)}")
263
+
264
+ # Try LLM analysis first
265
+ print(f" LLM ANALYSIS: Attempting LLM-based filter inference...")
266
+ llm_result = _analyze_query_with_llm(
267
+ query=query,
268
+ available_metadata=available_metadata,
269
+ llm_client=llm_client
270
+ )
271
+
272
+ if llm_result:
273
+ print(f"✅ LLM SUCCESS: LLM successfully inferred filters")
274
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
275
+ qdrant_filter, filter_summary = _build_qdrant_filter(llm_result)
276
+ if qdrant_filter:
277
+ print(f"✅ QDRANT FILTER: Successfully built Qdrant filter")
278
+ # print(f"✅ INFERRED FILTERS: {qdrant_filter}")
279
+ return qdrant_filter, filter_summary
280
+ else:
281
+ print(f"❌ QDRANT FILTER: Failed to build Qdrant filter, trying rule-based fallback")
282
+ rule_based_result = _infer_filters_rule_based(query, available_metadata)
283
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
284
+ qdrant_filter, filter_summary = _build_qdrant_filter(rule_based_result)
285
+ if qdrant_filter:
286
+ print(f"✅ RULE-BASED QDRANT FILTER: Successfully built Qdrant filter")
287
+ return qdrant_filter, filter_summary
288
+ else:
289
+ print(f"❌ RULE-BASED QDRANT FILTER: Failed to build Qdrant filter")
290
+ return None, None
291
+ else:
292
+ print(f"⚠️ LLM FAILED: LLM could not infer filters, trying rule-based fallback")
293
+ rule_based_result = _infer_filters_rule_based(query, available_metadata)
294
+ # Use the _build_qdrant_filter function to properly build the Qdrant filter
295
+ qdrant_filter, filter_summary = _build_qdrant_filter(rule_based_result)
296
+ if qdrant_filter:
297
+ print(f"✅ RULE-BASED QDRANT FILTER: Successfully built Qdrant filter")
298
+ return qdrant_filter, filter_summary
299
+ else:
300
+ print(f"❌ RULE-BASED QDRANT FILTER: Failed to build Qdrant filter")
301
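To make the LLM contract concrete, a self-contained sketch with a stub client (the stub and its canned JSON are illustrative, not part of this diff):

class StubLLM:
    """Stand-in client returning the JSON contract the prompt asks for."""
    def invoke(self, prompt: str) -> str:
        return '{"years": ["2022"], "sources": [], "subtype": [], "confidence": 1.0, "reasoning": "explicit year"}'

flt, summary = infer_filters_from_query(
    query="What challenges arose in 2022?",
    available_metadata={"sources": ["Local Government"], "years": ["2021", "2022"], "filenames": []},
    llm_client=StubLLM(),
)
# flt -> rest.Filter(must=[FieldCondition(key="metadata.year", match=MatchValue(value="2022"))])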
+ return None, None
302
+
303
+
304
+ def _analyze_query_with_llm(
305
+ query: str,
306
+ available_metadata: Dict[str, List[str]],
307
+ llm_client=None
308
+ ) -> dict:
309
+
310
+
311
+ """
312
+ - Filenames: {available_metadata.get('filenames', [])}
313
+
314
+ 📁 FILENAME FILTERING (Use Sparingly):
315
+ - Only if specific filename explicitly mentioned
316
+ - Prefer source/subtype over filename
317
+ - Be very conservative
318
+
319
+
320
+ "filenames": ["filename1", "filename2"] or [],
321
+ - For filenames: Only use if you have high confidence and can identify specific files
322
+ """
323
+
324
+
325
+ """
326
+ Use LLM to analyze query and infer appropriate filters.
327
+
328
+ Args:
329
+ query: User query to analyze
330
+ available_metadata: Available metadata values in the vectorstore
331
+ llm_client: LLM client for analysis
332
+
333
+ Returns:
334
+ Dictionary with inferred filters or empty dict if failed
335
+ """
336
+ if not llm_client:
337
+ print("❌ LLM CLIENT MISSING: Cannot analyze query without LLM client")
338
+ return {}
339
+
340
+ try:
341
+ print(f" LLM ANALYSIS: Analyzing query with LLM...")
342
+
343
+
344
+ """
345
+ For example: "What is the expected ... in 2024" - this refference to a future statement, so retrieving documents for 2023, 2022 and 2021 can be relevant too
346
+ Another example: "What is the GDP increase now compared to 2022" - this is a relative statement, refferring to past data, so both Year 2022, and now - 2025 needs to be detected/marked
347
+ """
348
+
349
+ # Create prompt for LLM analysis
350
+ prompt = f"""
351
+ You are a filter inference system. Analyze this query and return ONLY a JSON object.
352
+
353
+ Query: "{query}"
354
+
355
+ Available metadata:
356
+ - Sources: {available_metadata.get('sources', [])}
357
+ - Years: {available_metadata.get('years', [])}
358
+
359
+ FILTER INFERENCE GUIDELINES:
360
+
361
+ YEAR FILTERING (Be VERY Conservative):
362
+ ✅ INFER YEARS ONLY IF:
363
+ - Explicit 4-digit years: "2022", "2023", "2021"
364
+ - Clear relative terms: "last year", "this year", "recent", "current year" (for the context, now is 2025)
365
+ - Temporal context: "annual report 2022", "audit for 2023"
366
+ - Give multiple years for complex queries.
367
+
368
+
369
+ ❌ DO NOT INFER YEARS FOR:
370
+ - Vague terms: "implementation", "activities", "costs", "challenges", "issues"
371
+ - General concepts: "PDM", "administrative", "budget", "staff"
372
+ - Process descriptions: "how were", "what challenges", "management of"
373
+
374
+ 🏛️ SOURCE FILTERING (Context-Based):
375
+ - "Ministry, Department and Agency" → Central government, ministries, departments, PS/ST
376
+ - "Local Government" → Districts, municipalities, local authorities, DLG
377
+ - "Consolidated" → Annual consolidated reports, OAG reports
378
+ - "Thematic" → Special studies, thematic reports
379
+
380
+ 📄 SUBTYPE FILTERING (Document Type):
381
+ - "audit" → Audit reports, reviews, examinations
382
+ - "report" → General reports, annual reports
383
+ - "guidance" → Guidelines, directives, circulars
384
+
385
+ CONFIDENCE SCORING:
386
+ - 0.9-1.0: Crystal clear indicators (explicit years, specific sources)
387
+ - 0.7-0.8: Good indicators (relative years, clear context)
388
+ - 0.5-0.6: Moderate indicators (some context clues)
389
+ - 0.0-0.4: Low confidence (vague or unclear)
390
+
391
+ EXAMPLES:
392
+ ✅ "What challenges arose in 2022?" → years: ["2022"], confidence: 1
393
+ ✅ "How were administrative costs managed in our government?" → sources: ["Local Government"], confidence: 0.75
394
+ ✅ "PDM implementation guidelines from last year" → years: ["2024"], confidence: 0.9
395
+ ❌ "What issues arose with budget execution?" → NO FILTERS, confidence: 0.2
396
+ ❌ "How were tools related to administrative costs?" → NO FILTERS, confidence: 0.1
397
+
398
+ RESPONSE FORMAT (JSON only):
399
+ {{
400
+ "years": ["2022", "2023"] or [],
401
+ "sources": ["Ministry, Department and Agency", "Local Government"] or [],
402
+ "subtype": ["audit", "report"] or [],
403
+ "confidence": 0.8,
404
+ "reasoning": "Very brief explanation of filter choices"
405
+ }}
406
+
407
+ Rules:
408
+ - Use OR logic (SHOULD) for multiple values
409
+ - Prefer sources over filenames
410
+ - Only include years if clearly mentioned
411
+ - Return null for unclear fields
412
+ - For sources/subtypes: Include at least 3 candidates unless confidence is high and you can identify exactly one source (MUST)
413
+ - For years: If you want to include, then include at least 2 candidates unless confidence is high and you can identify exactly one year (MUST)
414
+ """
415
+
416
+ print(f"🔄 LLM CALL: Sending prompt to LLM...")
417
+ try:
418
+ # Try different methods to call the LLM
419
+ if hasattr(llm_client, 'invoke'):
420
+ response = llm_client.invoke(prompt)
421
+ elif hasattr(llm_client, 'generate'):
422
+ response = llm_client.generate([{"role": "user", "content": prompt}])
423
+ elif hasattr(llm_client, 'call'):
424
+ response = llm_client.call(prompt)
425
+ elif hasattr(llm_client, 'predict'):
426
+ response = llm_client.predict(prompt)
427
+ else:
428
+ # Try to call it directly
429
+ response = llm_client(prompt)
430
+
431
+ print(f"✅ LLM CALL SUCCESS: Received response from LLM")
432
+
433
+ # Extract content from response
434
+ if hasattr(response, 'content'):
435
+ response_content = response.content
436
+ elif hasattr(response, 'text'):
437
+ response_content = response.text
438
+ elif isinstance(response, str):
439
+ response_content = response
440
+ else:
441
+ response_content = str(response)
442
+
443
+ print(f"🔄 LLM RESPONSE: {response_content[:200]}...")
444
+
445
+ except Exception as e:
446
+ print(f"❌ LLM CALL FAILED: Error calling LLM - {e}")
447
+ return {}
448
+
449
+ # Parse JSON response
450
+ import json
451
+ import re
452
+ try:
453
+ print(f"🔄 JSON PARSING: Attempting to parse LLM response...")
454
+
455
+ # Clean the response to extract JSON from markdown
456
+ response_text = response_content.strip()
457
+
458
+ # Remove markdown formatting if present
459
+ if "```json" in response_text:
460
+ # Extract JSON from markdown code block
461
+ start_marker = "```json"
462
+ end_marker = "```"
463
+ start_idx = response_text.find(start_marker)
464
+ if start_idx != -1:
465
+ start_idx += len(start_marker)
466
+ end_idx = response_text.find(end_marker, start_idx)
467
+ if end_idx != -1:
468
+ response_text = response_text[start_idx:end_idx].strip()
469
+
470
+ # Try to find JSON object in the response
471
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
472
+ if json_match:
473
+ response_text = json_match.group(0)
474
+
475
+ print(f"🔄 JSON PARSING: Cleaned response: {response_text[:200]}...")
476
+
477
+ # Parse JSON
478
+ filters = json.loads(response_text)
479
+ print(f"✅ JSON PARSING SUCCESS: Parsed filters: {filters}")
480
+
481
+ # Validate filters
482
+ if not isinstance(filters, dict):
483
+ print(f"❌ JSON VALIDATION FAILED: Response is not a dictionary")
484
+ return {}
485
+
486
+ # Check if any filters were inferred
487
+ has_filters = any(filters.get(key) for key in ['sources', 'years', 'filenames'])
488
+ if not has_filters:
489
+ print(f"⚠️ QUERY DIFFICULT: LLM could not determine appropriate filters from query")
490
+ return {}
491
+
492
+ # print(f"✅ FILTER INFERENCE SUCCESS: Inferred filters: {filters}")
493
+ return filters
494
+
495
+ except json.JSONDecodeError as e:
496
+ print(f"❌ JSON PARSING FAILED: Invalid JSON format - {e}")
497
+ print(f"❌ JSON PARSING FAILED: Raw response: {response_text[:500]}...")
498
+ return {}
499
+ except Exception as e:
500
+ print(f"❌ JSON PARSING FAILED: Unexpected error - {e}")
501
+ print(f"❌ JSON PARSING FAILED: Raw response: {response_text[:500]}...")
502
+ return {}
503
+
504
+ except Exception as e:
505
+ print(f"❌ LLM CALL FAILED: Error calling LLM - {e}")
506
+ return {}
507
+
508
+
509
+ def _infer_filters_rule_based(
510
+ query: str,
511
+ available_metadata: dict
512
+ ) -> dict:
513
+ """
514
+ Rule-based fallback for filter inference with improved logic.
515
+
516
+ Args:
517
+ query: User query
518
+ available_metadata: Available metadata values in the vectorstore
519
+
520
+ Returns:
521
+ Dictionary of inferred filters
522
+ """
523
+ print(f" RULE-BASED ANALYSIS: Starting rule-based inference for query: '{query[:50]}...'")
524
+
525
+ inferred = {}
526
+ query_lower = query.lower()
527
+
528
+ # SEMANTIC SOURCE INFERENCE - Use semantic understanding
529
+ source_matches = []
530
+
531
+ # Define semantic mappings for better source inference
532
+ source_keywords = {
533
+ 'consolidated': ['consolidated', 'annual', 'oag', 'auditor general', 'government', 'financial statements', 'budget', 'expenditure', 'revenue'],
534
+ 'military': ['military', 'defence', 'defense', 'army', 'navy', 'air force', 'security', 'defense ministry'],
535
+ 'departmental': ['department', 'ministry', 'agency', 'authority', 'commission', 'board', 'directorate'],
536
+ 'thematic': ['thematic', 'sector', 'program', 'project', 'initiative', 'development', 'infrastructure']
537
+ }
538
+
539
+ for source in available_metadata.get('sources', []):
540
+ source_lower = source.lower()
541
+
542
+ # Direct keyword match
543
+ if source_lower in query_lower:
544
+ source_matches.append(source)
545
+ print(f"✅ DIRECT MATCH: Found direct keyword match for '{source}'")
546
+ else:
547
+ # Semantic keyword matching
548
+ if source_lower in source_keywords:
549
+ keywords = source_keywords[source_lower]
550
+ matches = sum(1 for keyword in keywords if keyword in query_lower)
551
+ if matches >= 2: # Require at least 2 keyword matches for semantic inference
552
+ source_matches.append(source)
553
+ print(f"✅ SEMANTIC MATCH: Found {matches} semantic keywords for '{source}': {[k for k in keywords if k in query_lower]}")
554
+
555
+ if source_matches:
556
+ # Use SHOULD (OR logic) for multiple sources
557
+ inferred['sources_should'] = source_matches
558
+ print(f"✅ SOURCE INFERENCE: Found {len(source_matches)} sources with OR logic: {source_matches}")
559
+ else:
560
+ print("❌ SOURCE INFERENCE: No source keywords found in query")
561
+
562
+ # Infer year filters - use SHOULD (OR logic) for multiple years
563
+ import re
564
+ year_matches = []
565
+ for year in available_metadata.get('years', []):
566
+ if year in query or f"'{year}" in query:
567
+ year_matches.append(year)
568
+
569
+ if year_matches:
570
+ # Use SHOULD (OR logic) for multiple years
571
+ inferred['years_should'] = year_matches
572
+ print(f"✅ YEAR INFERENCE: Found {len(year_matches)} years with OR logic: {year_matches}")
573
+ else:
574
+ print("❌ YEAR INFERENCE: No year references found in query")
575
+
576
+ # Only infer filename filters if no year filter was found (to avoid conflicts)
577
+ if not year_matches:
578
+ filename_matches = []
579
+ for filename in available_metadata.get('filenames', []):
580
+ # Only match if multiple words from filename appear in query
581
+ filename_words = filename.lower().split()
582
+ matches = sum(1 for word in filename_words if word in query_lower)
583
+ if matches >= 2: # High confidence threshold
584
+ filename_matches.append(filename)
585
+
586
+ if filename_matches:
587
+ # Use SHOULD (OR logic) for multiple filenames
588
+ inferred['filenames_should'] = filename_matches
589
+ print(f"✅ FILENAME INFERENCE: Found {len(filename_matches)} filenames with OR logic: {filename_matches}")
590
+ else:
591
+ print("❌ FILENAME INFERENCE: No high-confidence filename matches found")
592
+ else:
593
+ print("ℹ️ FILENAME INFERENCE: Skipped (year filter already applied to avoid conflicts)")
594
+
595
+ print(f" RULE-BASED RESULT: {inferred}")
596
+ return inferred
597
+
598
+
599
+ def _validate_inferred_filters(inferred_filters: dict) -> dict:
600
+ """
601
+ Validate and normalize inferred filters to ensure they're in the expected format.
602
+
603
+ Args:
604
+ inferred_filters: Raw inferred filters dictionary
605
+
606
+ Returns:
607
+ Validated and normalized filters dictionary
608
+ """
609
+ if not isinstance(inferred_filters, dict):
610
+ print(f"⚠️ FILTER VALIDATION: Inferred filters is not a dict: {type(inferred_filters)}")
611
+ return {}
612
+
613
+ validated = {}
614
+
615
+ # Normalize field names and validate values
616
+ for field_name in ['sources', 'sources_should', 'years', 'years_should', 'filenames', 'filenames_should']:
617
+ if field_name in inferred_filters and inferred_filters[field_name]:
618
+ value = inferred_filters[field_name]
619
+ if isinstance(value, list) and len(value) > 0:
620
+ # Remove any None or empty string values
621
+ clean_value = [v for v in value if v is not None and str(v).strip()]
622
+ if clean_value:
623
+ validated[field_name] = clean_value
624
+ print(f"✅ FILTER VALIDATION: {field_name} = {clean_value}")
625
+ elif isinstance(value, str) and value.strip():
626
+ validated[field_name] = [value.strip()]
627
+ print(f"✅ FILTER VALIDATION: {field_name} = [{value.strip()}]")
628
+
629
+ return validated
630
+
631
+
632
+ def _build_qdrant_filter(inferred_filters: dict) -> Tuple[Optional[rest.Filter], Dict[str, Any]]:
633
+ """
634
+ Build Qdrant filter from inferred filters.
635
+
636
+ Args:
637
+ inferred_filters: Dictionary with inferred filter values
638
+
639
+ Returns:
640
+ Tuple of (Qdrant Filter object or None, filter summary dict)
641
+ """
642
+ try:
643
+ from qdrant_client.http import models as rest
644
+
645
+ # Validate and normalize the inferred filters first
646
+ validated_filters = _validate_inferred_filters(inferred_filters)
647
+ if not validated_filters:
648
+ print(f"⚠️ NO VALID FILTERS: All filters were invalid or empty")
649
+ return None, {}
650
+
651
+ conditions = []
652
+ filter_summary = {}
653
+
654
+ # Handle sources (use OR logic for multiple values)
655
+ # Support both 'sources' and 'sources_should' field names
656
+ source_values = None
657
+ if 'sources' in validated_filters and validated_filters['sources']:
658
+ source_values = validated_filters['sources']
659
+ elif 'sources_should' in validated_filters and validated_filters['sources_should']:
660
+ source_values = validated_filters['sources_should']
661
+
662
+ if source_values and isinstance(source_values, list) and len(source_values) > 0:
663
+ if len(source_values) == 1:
664
+ conditions.append(rest.FieldCondition(
665
+ key="metadata.source",
666
+ match=rest.MatchValue(value=source_values[0])
667
+ ))
668
+ else:
669
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
670
+ conditions.append(rest.FieldCondition(
671
+ key="metadata.source",
672
+ match=rest.MatchAny(any=source_values)
673
+ ))
674
+ filter_summary['sources'] = f"SHOULD: {source_values}"
675
+
676
+ # Handle years (use OR logic for multiple values)
677
+ # Support both 'years' and 'years_should' field names
678
+ year_values = None
679
+ if 'years' in validated_filters and validated_filters['years']:
680
+ year_values = validated_filters['years']
681
+ elif 'years_should' in validated_filters and validated_filters['years_should']:
682
+ year_values = validated_filters['years_should']
683
+
684
+ if year_values and isinstance(year_values, list) and len(year_values) > 0:
685
+ if len(year_values) == 1:
686
+ conditions.append(rest.FieldCondition(
687
+ key="metadata.year",
688
+ match=rest.MatchValue(value=year_values[0])
689
+ ))
690
+ else:
691
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
692
+ conditions.append(rest.FieldCondition(
693
+ key="metadata.year",
694
+ match=rest.MatchAny(any=year_values)
695
+ ))
696
+ filter_summary['years'] = f"SHOULD: {year_values}"
697
+
698
+ # Handle filenames (use OR logic for multiple values)
699
+ # Support both 'filenames' and 'filenames_should' field names
700
+ filename_values = None
701
+ if 'filenames' in validated_filters and validated_filters['filenames']:
702
+ filename_values = validated_filters['filenames']
703
+ elif 'filenames_should' in validated_filters and validated_filters['filenames_should']:
704
+ filename_values = validated_filters['filenames_should']
705
+
706
+ if filename_values and isinstance(filename_values, list) and len(filename_values) > 0:
707
+ if len(filename_values) == 1:
708
+ conditions.append(rest.FieldCondition(
709
+ key="metadata.filename",
710
+ match=rest.MatchValue(value=filename_values[0])
711
+ ))
712
+ else:
713
+ # Use MatchAny instead of Filter(should=...) to avoid QueryPoints error
714
+ conditions.append(rest.FieldCondition(
715
+ key="metadata.filename",
716
+ match=rest.MatchAny(any=filename_values)
717
+ ))
718
+ filter_summary['filenames'] = f"SHOULD: {filename_values}"
719
+
720
+ # Build final filter
721
+ if conditions:
722
+ # Always wrap conditions in a Filter object, even for single conditions
723
+ result_filter = rest.Filter(must=conditions)
724
+
725
+ # Print clean filter summary
726
+ print(f"✅ APPLIED FILTERS: {filter_summary}")
727
+ return result_filter, filter_summary
728
+ else:
729
+ print(f"⚠️ NO FILTERS APPLIED: All documents will be searched")
730
+ return None, {}
731
+
732
+ except Exception as e:
733
+ print(f"❌ FILTER BUILD ERROR: {str(e)}")
734
+ print(f"🔍 DEBUG: Original inferred filters keys: {list(inferred_filters.keys()) if isinstance(inferred_filters, dict) else 'Not a dict'}")
735
+ print(f"🔍 DEBUG: Original inferred filters content: {inferred_filters}")
736
+ print(f"🔍 DEBUG: Validated filters keys: {list(validated_filters.keys()) if isinstance(validated_filters, dict) else 'Not a dict'}")
737
+ print(f"🔍 DEBUG: Validated filters content: {validated_filters}")
738
+ # Return a safe fallback - no filter (search all documents)
739
+ return None, {}
740
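A small worked example of the mapping this function performs (both "years" and "years_should" spellings are accepted):

inferred = {"sources": ["Local Government"], "years_should": ["2022", "2023"]}
flt, summary = _build_qdrant_filter(inferred)
# flt.must -> [FieldCondition(metadata.source == "Local Government"),
#              FieldCondition(metadata.year IN ["2022", "2023"])]
# summary  -> {"sources": "SHOULD: [...]", "years": "SHOULD: [...]"}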
+
741
+
742
+ class MetadataCache:
743
+ """Cache for vectorstore metadata to avoid repeated queries."""
744
+
745
+ def __init__(self):
746
+ self._cache = None
747
+ self._last_updated = None
748
+ self._cache_ttl = 3600 # 1 hour TTL
749
+
750
+ def get_metadata(self, vectorstore) -> dict:
751
+ """
752
+ Get metadata from cache or load it if not available/expired.
753
+
754
+ Args:
755
+ vectorstore: QdrantVectorStore instance
756
+
757
+ Returns:
758
+ Dictionary of available metadata values
759
+ """
760
+ import time
761
+
762
+ # Check if cache is valid
763
+ if (self._cache is not None and
764
+ self._last_updated is not None and
765
+ time.time() - self._last_updated < self._cache_ttl):
766
+ print(f"✅ METADATA CACHE: Using cached metadata")
767
+ return self._cache
768
+
769
+ try:
770
+ print(f"🔄 METADATA CACHE: Loading metadata from vectorstore...")
771
+
772
+
779
+ # Get ALL documents to extract complete metadata
780
+ print(f"📄 Scanning entire corpus for complete metadata extraction...")
781
+
782
+ # Get collection info to determine total size
783
+ try:
784
+ collection_info = vectorstore._client.get_collection(vectorstore.collection_name)
785
+ total_points = getattr(collection_info, 'points_count', 0)
786
+ print(f"📊 Total documents in corpus: {total_points}")
787
+ except Exception as e:
788
+ print(f"⚠️ Could not get collection size: {e}")
789
+ total_points = 0
790
+
791
+ # Extract unique metadata values from ALL documents
792
+ sources = set()
793
+ years = set()
794
+ filenames = set()
795
+
796
+ # Try to use scroll to get all documents in batches
797
+ batch_size = 1000 # Process in batches to avoid memory issues
798
+ offset = None
799
+ processed_count = 0
800
+ scroll_success = False
801
+
802
+ try:
803
+ while True:
804
+ # Scroll through all documents
805
+ scroll_result = vectorstore._client.scroll(
806
+ collection_name=vectorstore.collection_name,
807
+ limit=batch_size,
808
+                     offset=offset,
+                     with_payload=True,
+                     with_vectors=False  # We only need metadata
+                 )
+
+                 points = scroll_result[0]  # Get the points
+                 if not points:
+                     break  # No more documents
+
+                 # Process each document
+                 for i, point in enumerate(points):
+                     if hasattr(point, 'payload') and point.payload:
+                         payload = point.payload
+
+                         # Debug: log the structure of the first two documents
+                         if processed_count + i < 2:
+                             print(f"🔍 DEBUG Document {processed_count + i + 1} payload structure:")
+                             print(f"   Payload keys: {list(payload.keys()) if isinstance(payload, dict) else 'Not a dict'}")
+                             if isinstance(payload, dict) and 'metadata' in payload:
+                                 print(f"   Metadata keys: {list(payload['metadata'].keys()) if isinstance(payload['metadata'], dict) else 'Not a dict'}")
+                             elif isinstance(payload, dict):
+                                 print(f"   Top-level keys: {list(payload.keys())}")
+                             print(f"   Payload type: {type(payload)}")
+                             print(f"   Payload sample: {str(payload)[:200]}...")
+                             print()
+
+                         # Try different metadata structures
+                         found_metadata = False
+
+                         # Structure 1: payload['metadata']['source']
+                         if isinstance(payload, dict) and 'metadata' in payload:
+                             metadata = payload['metadata']
+                             if isinstance(metadata, dict):
+                                 if 'source' in metadata:
+                                     sources.add(metadata['source'])
+                                     found_metadata = True
+                                 if 'year' in metadata:
+                                     years.add(metadata['year'])
+                                     found_metadata = True
+                                 if 'filename' in metadata:
+                                     filenames.add(metadata['filename'])
+                                     found_metadata = True
+
+                         # Structure 2: payload['source'] (direct)
+                         if isinstance(payload, dict):
+                             if 'source' in payload:
+                                 sources.add(payload['source'])
+                                 found_metadata = True
+                             if 'year' in payload:
+                                 years.add(payload['year'])
+                                 found_metadata = True
+                             if 'filename' in payload:
+                                 filenames.add(payload['filename'])
+                                 found_metadata = True
+
+                         # Structure 3: any nested dict that might contain metadata
+                         if not found_metadata and isinstance(payload, dict):
+                             for key, value in payload.items():
+                                 if isinstance(value, dict):
+                                     if 'source' in value:
+                                         sources.add(value['source'])
+                                         found_metadata = True
+                                     if 'year' in value:
+                                         years.add(value['year'])
+                                         found_metadata = True
+                                     if 'filename' in value:
+                                         filenames.add(value['filename'])
+                                         found_metadata = True
+
+                 processed_count += len(points)
+                 progress_pct = (processed_count / total_points * 100) if total_points > 0 else 0
+                 print(f"📄 Processed {processed_count}/{total_points} documents ({progress_pct:.1f}%)... (sources: {len(sources)}, years: {len(years)}, filenames: {len(filenames)})")
+
+                 # Update offset for the next batch
+                 offset = scroll_result[1]  # Next offset
+                 if offset is None:
+                     break  # No more documents
+
+             scroll_success = True
+             print(f"✅ Scroll method successful - processed {processed_count} documents")
+
+         except Exception as e:
+             print(f"❌ Scroll method failed: {e}")
+             print("🔄 Falling back to similarity search method...")
+
+             # Fallback: use similarity search with multiple queries for broader coverage
+             fallback_queries = [
+                 "",  # Empty query
+                 "audit", "report", "government", "ministry", "department",
+                 "local", "consolidated", "annual", "financial", "budget",
+                 "2020", "2021", "2022", "2023", "2024"  # Year queries
+             ]
+
+             processed_count = 0
+             for query in fallback_queries:
+                 try:
+                     # Get documents for this query
+                     docs = vectorstore.similarity_search(query, k=1000)  # Get more per query
+
+                     for j, doc in enumerate(docs):
+                         if hasattr(doc, 'metadata') and doc.metadata:
+                             # Debug: log the structure of the first few documents per query
+                             if processed_count + j < 3:
+                                 print(f"🔍 DEBUG Fallback Document {processed_count + j + 1} (query: '{query}') metadata structure:")
+                                 print(f"   Metadata keys: {list(doc.metadata.keys()) if isinstance(doc.metadata, dict) else 'Not a dict'}")
+                                 print(f"   Metadata type: {type(doc.metadata)}")
+                                 print(f"   Metadata sample: {str(doc.metadata)[:200]}...")
+                                 print()
+
+                             if 'source' in doc.metadata:
+                                 sources.add(doc.metadata['source'])
+                             if 'year' in doc.metadata:
+                                 years.add(doc.metadata['year'])
+                             if 'filename' in doc.metadata:
+                                 filenames.add(doc.metadata['filename'])
+
+                     processed_count += len(docs)
+                     print(f"📄 Fallback query '{query}': {len(docs)} docs (total: {processed_count}, sources: {len(sources)}, years: {len(years)}, filenames: {len(filenames)})")
+
+                 except Exception as query_error:
+                     print(f"⚠️ Fallback query '{query}' failed: {query_error}")
+                     continue
+
+             print(f"✅ Fallback method completed - processed {processed_count} documents")
+
+         print(f"✅ Completed scanning {processed_count} documents from the corpus")
+
+         # Convert to sorted lists
+         metadata = {
+             'sources': sorted(sources),
+             'years': sorted(years),
+             'filenames': sorted(filenames)
+         }
+
+         # Cache the results
+         self._cache = metadata
+         self._last_updated = time.time()
+
+         print(f"✅ Metadata extracted from the corpus: {len(sources)} sources, {len(years)} years, {len(filenames)} files")
+
+         # Debug: show what was actually found
+         if sources:
+             print(f"📁 Sources found: {sorted(sources)}")
+         else:
+             print("❌ No sources found - check metadata structure")
+
+         if years:
+             print(f"📅 Years found: {sorted(years)}")
+         else:
+             print("❌ No years found - check metadata structure")
+
+         if filenames:
+             print(f"📄 Filenames found: {sorted(filenames)[:10]}{'...' if len(filenames) > 10 else ''}")
+         else:
+             print("❌ No filenames found - check metadata structure")
+
+         return metadata
+
+     except Exception as e:
+         print(f"❌ Error extracting metadata: {e}")
+         return {'sources': [], 'years': [], 'filenames': []}
+
+
+ # Global metadata cache
+ _metadata_cache = MetadataCache()
+
+
+ def get_available_metadata(vectorstore) -> dict:
+     """Get available metadata values from the vectorstore efficiently."""
+     return _metadata_cache.get_metadata(vectorstore)
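A quick usage sketch for the cached lookup (illustrative, not part of the commit: `vectorstore` is assumed to be an already-connected QdrantVectorStore, and the import path depends on where this module lives in the package):

    meta = get_available_metadata(vectorstore)   # first call scans the collection
    print(meta['sources'], meta['years'], meta['filenames'][:5])
    meta = get_available_metadata(vectorstore)   # later calls are served from the cache

The returned dict always has the keys 'sources', 'years', and 'filenames', each a sorted list, even when extraction fails.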
src/retrieval/hybrid.py ADDED
@@ -0,0 +1,479 @@
+ """Hybrid search implementation combining vector and sparse retrieval."""
+
+ import os
+ import pickle
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import numpy as np
+ from langchain.docstore.document import Document
+ from langchain_community.retrievers import BM25Retriever
+ from langchain_qdrant import QdrantVectorStore
+
+ from .filter import create_filter
+
+
+ class HybridRetriever:
+     """
+     Hybrid retrieval system combining vector search (dense) and BM25 (sparse) search.
+     Supports configurable search modes: vector_only, sparse_only, or hybrid.
+     """
+
+     def __init__(self, config: Dict[str, Any]):
+         """
+         Initialize the hybrid retriever.
+
+         Args:
+             config: Configuration dictionary with hybrid search settings
+         """
+         self.config = config
+         self.bm25_retriever = None
+         self.documents = []
+         self._bm25_cache_file = None
+
+     def _get_bm25_cache_path(self) -> str:
+         """Get the path of the BM25 cache file."""
+         cache_dir = Path("cache/bm25")
+         cache_dir.mkdir(parents=True, exist_ok=True)
+         return str(cache_dir / "bm25_retriever.pkl")
+
+     def initialize_bm25(self, documents: List[Document], force_rebuild: bool = False) -> None:
+         """
+         Initialize the BM25 retriever with documents.
+
+         Args:
+             documents: List of Document objects to index
+             force_rebuild: Whether to force rebuilding the BM25 index
+         """
+         self.documents = documents
+         self._bm25_cache_file = self._get_bm25_cache_path()
+
+         # Try to load a cached BM25 retriever
+         if not force_rebuild and os.path.exists(self._bm25_cache_file):
+             try:
+                 print("Loading cached BM25 retriever...")
+                 with open(self._bm25_cache_file, 'rb') as f:
+                     self.bm25_retriever = pickle.load(f)
+                 print(f"✅ Loaded cached BM25 retriever with {len(self.documents)} documents")
+                 return
+             except Exception as e:
+                 print(f"⚠️ Failed to load cached BM25 retriever: {e}")
+                 print("Building new BM25 index...")
+
+         # Build a new BM25 retriever
+         print("Building BM25 index...")
+         try:
+             # Use langchain's BM25Retriever
+             self.bm25_retriever = BM25Retriever.from_documents(documents)
+
+             # Configure BM25 parameters
+             bm25_config = self.config.get("bm25", {})
+             self.bm25_retriever.k = bm25_config.get("top_k", 20)
+
+             # Cache the BM25 retriever
+             with open(self._bm25_cache_file, 'wb') as f:
+                 pickle.dump(self.bm25_retriever, f)
+             print(f"✅ Built and cached BM25 retriever with {len(documents)} documents")
+
+         except Exception as e:
+             print(f"❌ Failed to build BM25 retriever: {e}")
+             print("BM25 search will be disabled")
+             self.bm25_retriever = None
+
+     def _filter_documents_by_metadata(
+         self,
+         documents: List[Document],
+         reports: Optional[List[str]] = None,
+         sources: Optional[str] = None,
+         subtype: Optional[List[str]] = None,
+         year: Optional[List[str]] = None
+     ) -> List[Document]:
+         """
+         Filter documents by metadata criteria.
+
+         Args:
+             documents: List of documents to filter
+             reports: List of specific report filenames
+             sources: Source category
+             subtype: List of subtypes
+             year: List of years
+
+         Returns:
+             Filtered list of documents
+         """
+         if not any([reports, sources, subtype, year]):
+             return documents
+
+         filtered_docs = []
+         for doc in documents:
+             metadata = doc.metadata
+
+             # Filter by reports
+             if reports:
+                 filename = metadata.get('filename', '')
+                 if not any(report in filename for report in reports):
+                     continue
+
+             # Filter by source
+             if sources and metadata.get('source', '') != sources:
+                 continue
+
+             # Filter by subtype
+             if subtype and metadata.get('subtype', '') not in subtype:
+                 continue
+
+             # Filter by year
+             if year and str(metadata.get('year', '')) not in year:
+                 continue
+
+             filtered_docs.append(doc)
+
+         return filtered_docs
+
+     def _bm25_search(
+         self,
+         query: str,
+         k: int = 20,
+         reports: Optional[List[str]] = None,
+         sources: Optional[str] = None,
+         subtype: Optional[List[str]] = None,
+         year: Optional[List[str]] = None
+     ) -> List[Tuple[Document, float]]:
+         """
+         Perform BM25 sparse search.
+
+         Args:
+             query: Search query
+             k: Number of documents to retrieve
+             reports: List of specific report filenames
+             sources: Source category
+             subtype: List of subtypes
+             year: List of years
+
+         Returns:
+             List of (Document, score) tuples
+         """
+         if not self.bm25_retriever:
+             print("⚠️ BM25 retriever not available")
+             return []
+
+         try:
+             # Get BM25 results
+             self.bm25_retriever.k = k
+             bm25_docs = self.bm25_retriever.invoke(query)
+
+             # Apply metadata filtering
+             if any([reports, sources, subtype, year]):
+                 bm25_docs = self._filter_documents_by_metadata(
+                     bm25_docs, reports, sources, subtype, year
+                 )
+
+             # BM25Retriever doesn't expose scores directly, so assign placeholder
+             # scores that decrease with rank, normalized to [0, 1] for consistency
+             # with vector search. A production system should surface the actual
+             # BM25 scores instead.
+             results = []
+             for i, doc in enumerate(bm25_docs):
+                 score = max(0.1, 1.0 - (i / max(len(bm25_docs), 1)))
+                 results.append((doc, score))
+
+             return results
+
+         except Exception as e:
+             print(f"❌ BM25 search failed: {e}")
+             return []
+
+     def _vector_search(
+         self,
+         vectorstore: QdrantVectorStore,
+         query: str,
+         k: int = 20,
+         reports: Optional[List[str]] = None,
+         sources: Optional[str] = None,
+         subtype: Optional[List[str]] = None,
+         year: Optional[List[str]] = None
+     ) -> List[Tuple[Document, float]]:
+         """
+         Perform vector similarity search.
+
+         Args:
+             vectorstore: QdrantVectorStore instance
+             query: Search query
+             k: Number of documents to retrieve
+             reports: List of specific report filenames
+             sources: Source category
+             subtype: List of subtypes
+             year: List of years
+
+         Returns:
+             List of (Document, score) tuples
+         """
+         try:
+             # Create metadata filter
+             filter_obj = create_filter(
+                 reports=reports,
+                 sources=sources,
+                 subtype=subtype,
+                 year=year
+             )
+
+             # Perform vector search
+             if filter_obj:
+                 return vectorstore.similarity_search_with_score(
+                     query, k=k, filter=filter_obj
+                 )
+             return vectorstore.similarity_search_with_score(query, k=k)
+
+         except Exception as e:
+             print(f"❌ Vector search failed: {e}")
+             return []
+
+     def _normalize_scores(self, results: List[Tuple[Document, float]], method: str = "min_max") -> List[Tuple[Document, float]]:
+         """
+         Normalize scores to the [0, 1] range.
+
+         Args:
+             results: List of (Document, score) tuples
+             method: Normalization method ('min_max' or 'z_score')
+
+         Returns:
+             List of (Document, normalized_score) tuples
+         """
+         if not results:
+             return results
+
+         scores = [score for _, score in results]
+
+         if method == "min_max":
+             min_score = min(scores)
+             max_score = max(scores)
+             if max_score == min_score:
+                 normalized_results = [(doc, 1.0) for doc, _ in results]
+             else:
+                 normalized_results = [
+                     (doc, (score - min_score) / (max_score - min_score))
+                     for doc, score in results
+                 ]
+         elif method == "z_score":
+             mean_score = np.mean(scores)
+             std_score = np.std(scores)
+             if std_score == 0:
+                 normalized_results = [(doc, 1.0) for doc, _ in results]
+             else:
+                 # Clamp negative z-scores to 0 so scores stay non-negative
+                 normalized_results = [
+                     (doc, max(0, (score - mean_score) / std_score))
+                     for doc, score in results
+                 ]
+         else:
+             normalized_results = results
+
+         return normalized_results
+
+     def _combine_results(
+         self,
+         vector_results: List[Tuple[Document, float]],
+         bm25_results: List[Tuple[Document, float]],
+         alpha: float = 0.5
+     ) -> List[Tuple[Document, float]]:
+         """
+         Combine vector and BM25 results with weighted scoring.
+
+         Args:
+             vector_results: Vector search results
+             bm25_results: BM25 search results
+             alpha: Weight for vector scores (1 - alpha for BM25 scores)
+
+         Returns:
+             Combined and ranked results
+         """
+         # Normalize scores
+         vector_results = self._normalize_scores(vector_results)
+         bm25_results = self._normalize_scores(bm25_results)
+
+         # Key documents by content: the two retrievers return distinct Document
+         # objects for the same underlying chunk, so identity-based keys (id())
+         # would never overlap and the fusion would degenerate to concatenation.
+         vector_docs = {doc.page_content: (doc, score) for doc, score in vector_results}
+         bm25_docs = {doc.page_content: (doc, score) for doc, score in bm25_results}
+
+         # Combine scores
+         combined_scores = {}
+         all_doc_keys = set(vector_docs) | set(bm25_docs)
+
+         for doc_key in all_doc_keys:
+             vector_score = vector_docs.get(doc_key, (None, 0.0))[1]
+             bm25_score = bm25_docs.get(doc_key, (None, 0.0))[1]
+
+             # Weighted combination
+             combined_score = alpha * vector_score + (1 - alpha) * bm25_score
+
+             # Get the document object from whichever result set has it
+             doc = vector_docs.get(doc_key, bm25_docs.get(doc_key))[0]
+             combined_scores[doc_key] = (doc, combined_score)
+
+         # Sort by combined score (descending)
+         return sorted(
+             combined_scores.values(),
+             key=lambda x: x[1],
+             reverse=True
+         )
+
+     def retrieve(
+         self,
+         vectorstore: QdrantVectorStore,
+         query: str,
+         mode: str = "hybrid",
+         reports: Optional[List[str]] = None,
+         sources: Optional[str] = None,
+         subtype: Optional[List[str]] = None,
+         year: Optional[List[str]] = None,
+         alpha: float = 0.5,
+         k: Optional[int] = None
+     ) -> List[Document]:
+         """
+         Retrieve documents using the specified search mode.
+
+         Args:
+             vectorstore: QdrantVectorStore instance
+             query: Search query
+             mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
+             reports: List of specific report filenames
+             sources: Source category
+             subtype: List of subtypes
+             year: List of years
+             alpha: Weight for vector scores in hybrid mode (0.5 = equal weight)
+             k: Number of documents to retrieve
+
+         Returns:
+             List of relevant Document objects
+         """
+         # Delegate to retrieve_with_scores and drop the scores; the two code
+         # paths are otherwise identical.
+         results = self.retrieve_with_scores(
+             vectorstore, query, mode, reports, sources, subtype, year, alpha, k
+         )
+         return [doc for doc, _ in results]
+
+     def retrieve_with_scores(
+         self,
+         vectorstore: QdrantVectorStore,
+         query: str,
+         mode: str = "hybrid",
+         reports: Optional[List[str]] = None,
+         sources: Optional[str] = None,
+         subtype: Optional[List[str]] = None,
+         year: Optional[List[str]] = None,
+         alpha: float = 0.5,
+         k: Optional[int] = None
+     ) -> List[Tuple[Document, float]]:
+         """
+         Retrieve documents with scores using the specified search mode.
+
+         Args:
+             vectorstore: QdrantVectorStore instance
+             query: Search query
+             mode: Search mode ('vector_only', 'sparse_only', or 'hybrid')
+             reports: List of specific report filenames
+             sources: Source category
+             subtype: List of subtypes
+             year: List of years
+             alpha: Weight for vector scores in hybrid mode (0.5 = equal weight)
+             k: Number of documents to retrieve
+
+         Returns:
+             List of (Document, score) tuples
+         """
+         if k is None:
+             k = self.config.get("retriever", {}).get("top_k", 20)
+
+         if mode == "vector_only":
+             # Vector search only
+             results = self._vector_search(
+                 vectorstore, query, k, reports, sources, subtype, year
+             )
+
+         elif mode == "sparse_only":
+             # BM25 search only
+             results = self._bm25_search(
+                 query, k, reports, sources, subtype, year
+             )
+
+         elif mode == "hybrid":
+             # Hybrid search - retrieve extra candidates from each method so
+             # the fusion has more overlap to work with
+             retrieval_k = min(k * 2, 50)
+
+             vector_results = self._vector_search(
+                 vectorstore, query, retrieval_k, reports, sources, subtype, year
+             )
+             bm25_results = self._bm25_search(
+                 query, retrieval_k, reports, sources, subtype, year
+             )
+
+             results = self._combine_results(vector_results, bm25_results, alpha)
+
+         else:
+             raise ValueError(f"Unknown search mode: {mode}")
+
+         # Limit to the top k results
+         return results[:k]
+
+
+ def get_available_search_modes() -> List[str]:
+     """Get the list of available search modes."""
+     return ["vector_only", "sparse_only", "hybrid"]
+
+
+ def get_search_mode_description() -> Dict[str, str]:
+     """Get a description of each search mode."""
+     return {
+         "vector_only": "Semantic search using dense embeddings - good for conceptual matching",
+         "sparse_only": "Keyword search using BM25 - good for exact term matching",
+         "hybrid": "Combined semantic and keyword search - balanced approach"
+     }
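A short usage sketch (illustrative; `config`, `documents`, and the connected `vectorstore` are assumed to exist already and are placeholders, not part of the commit):

    retriever = HybridRetriever(config)
    retriever.initialize_bm25(documents)   # builds the BM25 index or loads the cached one
    docs = retriever.retrieve(
        vectorstore,
        "irregularities in procurement",
        mode="hybrid",
        year=["2022"],
        alpha=0.7,   # weight vector scores over BM25
        k=10,
    )

With alpha=0.7, a chunk whose normalized vector score is 0.8 and whose BM25 placeholder score is 0.5 fuses to 0.7 * 0.8 + 0.3 * 0.5 = 0.71; a chunk found by only one method keeps just its weighted share.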
src/vectorstore.py ADDED
@@ -0,0 +1,266 @@
+ """Vector store management and operations."""
+
+ from typing import Any, Dict, List, Optional
+
+ import torch
+ from langchain.docstore.document import Document
+ from langchain_core.embeddings import Embeddings
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_qdrant import QdrantVectorStore
+ from sentence_transformers import SentenceTransformer
+
+
+ class MatryoshkaEmbeddings(Embeddings):
+     """Custom embeddings class that supports Matryoshka dimension truncation."""
+
+     def __init__(self, model_name: str, truncate_dim: Optional[int] = None, **kwargs):
+         """
+         Initialize Matryoshka embeddings.
+
+         Args:
+             model_name: Name of the embedding model
+             truncate_dim: Dimension to truncate to (for Matryoshka models)
+             **kwargs: Extra arguments forwarded to HuggingFaceEmbeddings
+                 (ignored for Matryoshka models)
+         """
+         self.model_name = model_name
+         self.truncate_dim = truncate_dim
+
+         if truncate_dim and "matryoshka" in model_name.lower():
+             # Use SentenceTransformer directly for Matryoshka models
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             self.model = SentenceTransformer(model_name, truncate_dim=truncate_dim, device=device)
+             print(f"🔧 Matryoshka model configured for {truncate_dim} dimensions")
+         else:
+             # Use standard HuggingFaceEmbeddings
+             self.model = HuggingFaceEmbeddings(model_name=model_name, **kwargs)
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Embed documents."""
+         if self.truncate_dim and "matryoshka" in self.model_name.lower():
+             embeddings = self.model.encode(texts, normalize_embeddings=True)
+             return embeddings.tolist()
+         return self.model.embed_documents(texts)
+
+     def embed_query(self, text: str) -> List[float]:
+         """Embed a query."""
+         if self.truncate_dim and "matryoshka" in self.model_name.lower():
+             embedding = self.model.encode([text], normalize_embeddings=True)
+             return embedding[0].tolist()
+         return self.model.embed_query(text)
+
+
+ class VectorStoreManager:
+     """Manages vector store operations and connections."""
+
+     def __init__(self, config: Dict[str, Any]):
+         """
+         Initialize the vector store manager.
+
+         Args:
+             config: Configuration dictionary
+         """
+         self.config = config
+         self.embeddings = self._create_embeddings()
+         self.vectorstore = None
+
+         # Metadata fields that need payload indexes for filtering
+         self.metadata_fields = [
+             ("metadata.year", "keyword"),
+             ("metadata.source", "keyword"),
+             ("metadata.filename", "keyword"),
+             # Add more metadata fields as needed
+         ]
+
+     def _create_embeddings(self) -> Embeddings:
+         """Create the embeddings model from configuration."""
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         model_name = self.config["retriever"]["model"]
+         normalize = self.config["retriever"]["normalize"]
+
+         model_kwargs = {"device": device}
+         encode_kwargs = {
+             "normalize_embeddings": normalize,
+             "batch_size": 100,
+         }
+
+         # For Matryoshka models, check whether we need to truncate dimensions
+         if "matryoshka" in model_name.lower():
+             collection_name = self.config.get("qdrant", {}).get("collection_name", "")
+
+             if "modernbert-embed-base-akryl-matryoshka" in collection_name:
+                 # This collection expects 768-dimensional vectors
+                 return MatryoshkaEmbeddings(
+                     model_name=model_name,
+                     truncate_dim=768,
+                     model_kwargs=model_kwargs,
+                     encode_kwargs=encode_kwargs,
+                     show_progress=True,
+                 )
+
+         # Use standard HuggingFaceEmbeddings for non-Matryoshka models
+         return HuggingFaceEmbeddings(
+             model_name=model_name,
+             model_kwargs=model_kwargs,
+             encode_kwargs=encode_kwargs,
+             show_progress=True,
+         )
+
+     def ensure_metadata_indexes(self) -> None:
+         """
+         Create payload indexes for all required metadata fields.
+         This ensures filtering works properly, especially in Qdrant Cloud.
+         """
+         if not self.vectorstore:
+             return
+
+         collection_name = self.config["qdrant"]["collection_name"]
+
+         for field_name, field_type in self.metadata_fields:
+             try:
+                 self.vectorstore.client.create_payload_index(
+                     collection_name=collection_name,
+                     field_name=field_name,
+                     field_schema=field_type  # current qdrant-client name for this parameter
+                 )
+                 print(f"Created payload index for {field_name} ({field_type})")
+             except Exception as e:
+                 # Index might already exist or another error occurred - log but continue
+                 print(f"Index creation for {field_name} ({field_type}): {e}")
+
+     def connect_to_existing(self, force_recreate: bool = False) -> QdrantVectorStore:
+         """
+         Connect to an existing Qdrant collection.
+
+         Args:
+             force_recreate: If True, recreate the collection on a dimension mismatch
+
+         Returns:
+             QdrantVectorStore instance
+         """
+         qdrant_config = self.config["qdrant"]
+
+         kwargs_qdrant = {
+             "url": qdrant_config["url"],
+             "collection_name": qdrant_config["collection_name"],
+             "prefer_grpc": qdrant_config.get("prefer_grpc", True),
+             "api_key": qdrant_config.get("api_key", None),
+         }
+
+         if force_recreate:
+             kwargs_qdrant["force_recreate"] = True
+
+         self.vectorstore = QdrantVectorStore.from_existing_collection(
+             embedding=self.embeddings,
+             **kwargs_qdrant
+         )
+
+         # Ensure payload indexes exist for metadata filtering
+         self.ensure_metadata_indexes()
+
+         return self.vectorstore
+
+     def create_from_documents(self, documents: List[Document]) -> QdrantVectorStore:
+         """
+         Create a new Qdrant collection from documents.
+
+         Args:
+             documents: List of Document objects
+
+         Returns:
+             QdrantVectorStore instance
+         """
+         qdrant_config = self.config["qdrant"]
+
+         kwargs_qdrant = {
+             "url": qdrant_config["url"],
+             "collection_name": qdrant_config["collection_name"],
+             "prefer_grpc": qdrant_config.get("prefer_grpc", True),
+             "api_key": qdrant_config.get("api_key", None),
+         }
+
+         self.vectorstore = QdrantVectorStore.from_documents(
+             documents=documents,
+             embedding=self.embeddings,
+             **kwargs_qdrant
+         )
+
+         # Ensure payload indexes exist for metadata filtering
+         self.ensure_metadata_indexes()
+
+         return self.vectorstore
+
+     def delete_collection(self) -> None:
+         """Delete the current Qdrant collection."""
+         collection_name = self.config["qdrant"].get("collection_name")
+         self.vectorstore.client.delete_collection(
+             collection_name=collection_name
+         )
+
+     def get_vectorstore(self) -> Optional[QdrantVectorStore]:
+         """Get the current vectorstore instance."""
+         return self.vectorstore
+
+
+ def get_local_qdrant(config: Dict[str, Any]) -> QdrantVectorStore:
+     """
+     Get a local Qdrant vector store (legacy function for compatibility).
+
+     Args:
+         config: Configuration dictionary
+
+     Returns:
+         QdrantVectorStore instance
+     """
+     manager = VectorStoreManager(config)
+     return manager.connect_to_existing()
+
+
+ def create_vectorstore(config: Dict[str, Any], documents: List[Document]) -> QdrantVectorStore:
+     """
+     Create a new vector store from documents.
+
+     Args:
+         config: Configuration dictionary
+         documents: List of Document objects
+
+     Returns:
+         QdrantVectorStore instance
+     """
+     manager = VectorStoreManager(config)
+     return manager.create_from_documents(documents)
+
+
+ def get_embeddings_model(config: Dict[str, Any]) -> Embeddings:
+     """
+     Create the embeddings model from configuration (legacy function).
+
+     Args:
+         config: Configuration dictionary
+
+     Returns:
+         Embeddings instance (HuggingFaceEmbeddings or MatryoshkaEmbeddings)
+     """
+     manager = VectorStoreManager(config)
+     return manager.embeddings
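A minimal end-to-end sketch (illustrative; assumes a settings dict with `retriever` and `qdrant` sections like the ones the loader reads - the values shown are placeholders, not part of the commit):

    config = {
        "retriever": {"model": "BAAI/bge-m3", "normalize": True},
        "qdrant": {"url": "http://localhost:6333", "collection_name": "docling"},
    }
    manager = VectorStoreManager(config)
    vectorstore = manager.connect_to_existing()   # reuse an existing collection
    docs = vectorstore.similarity_search("audit findings", k=5)

Choosing a collection from src/config/collections.json whose embedding model matches `retriever.model` matters here: connecting with a different model, or a mismatched Matryoshka truncation, will fail or return meaningless neighbors because the vector dimensions and spaces would not line up.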