akryldigital committed · Commit 150fb2f · verified · 1 Parent(s): c0655b8

add colpali scripts

src/colpali/__init__.py ADDED
@@ -0,0 +1,29 @@
+ """
+ ColPali Visual Document Retrieval Module
+
+ This module implements visual document retrieval using ColPali (ColBERT-style multi-vector embeddings)
+ for processing PDF documents as images.
+
+ All components are self-contained within src/colpali/ - no external dependencies on colpali_colab_package.
+ """
+
+ # Core inference components
+ from .processor import ColPaliProcessor
+ from .search import VisualDocumentSearch
+ from .visual_search import VisualSearchAdapter, VisualSearchResult, create_visual_search_adapter
+
+ # Upload/management components (for data ingestion)
+ from .qdrant_manager import ColPaliQdrantManager
+ from .visualizer import generate_saliency_maps
+
+ __all__ = [
+     # Inference
+     "ColPaliProcessor",
+     "VisualDocumentSearch",
+     "VisualSearchAdapter",
+     "VisualSearchResult",
+     "create_visual_search_adapter",
+     # Data management
+     "ColPaliQdrantManager",
+     "generate_saliency_maps",
+ ]
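
For orientation, a minimal sketch of how the exports above fit together; the URL, API key, and query are placeholders, and `src.colpali` is assumed to be importable from the project root:

```python
# Hypothetical wiring of the public API above; credentials are placeholders.
from src.colpali import VisualSearchAdapter

adapter = VisualSearchAdapter(
    qdrant_url="https://example.qdrant.cloud",  # placeholder
    qdrant_api_key="YOUR_API_KEY",              # placeholder
    collection_name="colSmol-500M",
)
results = adapter.search("budget allocation by district", top_k=5)
for r in results:
    print(r.id, r.score, r.payload.get("filename"))
```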
src/colpali/processor.py ADDED
@@ -0,0 +1,120 @@
+ """
+ ColPali Query Embedding Processor
+
+ Handles query embedding generation using the ColSmol-500M model.
+ This is a standalone implementation for inference only (no PDF processing).
+ """
+
+ import logging
+ from typing import Optional
+
+ import torch
+
+ logger = logging.getLogger(__name__)
+
+ # Check if colpali_engine is available
+ try:
+     from colpali_engine.models import ColIdefics3, ColIdefics3Processor
+     COLPALI_AVAILABLE = True
+ except ImportError:
+     COLPALI_AVAILABLE = False
+     logger.warning("colpali_engine not installed. Install with: pip install colpali-engine")
+
+
+ class ColPaliProcessor:
+     """
+     Processes queries using ColPali for visual document retrieval.
+
+     This is a lightweight processor focused on query embedding generation.
+     """
+
+     def __init__(
+         self,
+         model_name: str = "vidore/colSmol-500M",
+         device: str = "cpu",
+         torch_dtype: torch.dtype = torch.float32,
+         batch_size: int = 4
+     ):
+         """
+         Initialize the ColPali processor.
+
+         Args:
+             model_name: HuggingFace model name for ColPali
+             device: Device to use ("cuda", "cpu", "mps")
+             torch_dtype: Data type for model weights
+             batch_size: Batch size for processing
+         """
+         if not COLPALI_AVAILABLE:
+             raise ImportError(
+                 "colpali_engine not installed. Install with: "
+                 "pip install colpali-engine"
+             )
+
+         # Validate model name (must include organization prefix)
+         if '/' not in model_name:
+             logger.warning(f"⚠️ Model name '{model_name}' is missing an organization prefix, adding 'vidore/'")
+             model_name = f"vidore/{model_name}"
+
+         self.model_name = model_name
+         self.device = device
+         self.torch_dtype = torch_dtype
+         self.batch_size = batch_size
+
+         logger.info(f"🤖 Loading ColPali model: {model_name}")
+         logger.info(f"   Device: {device}, dtype: {torch_dtype}")
+
+         # Load model and processor
+         try:
+             # Determine attention implementation
+             attn_implementation = "eager"  # Default for compatibility
+
+             if device != "cpu":
+                 try:
+                     import flash_attn
+                     attn_implementation = "flash_attention_2"
+                     logger.info("   Using FlashAttention2 for faster inference")
+                 except ImportError:
+                     logger.info("   FlashAttention2 not available, using eager attention")
+
+             self.model = ColIdefics3.from_pretrained(
+                 model_name,
+                 dtype=torch_dtype,
+                 device_map=device,
+                 attn_implementation=attn_implementation
+             ).eval()
+
+             self.processor = ColIdefics3Processor.from_pretrained(model_name)
+
+             logger.info("✅ ColPali model loaded successfully")
+             logger.info(f"   Attention implementation: {attn_implementation}")
+
+         except Exception as e:
+             logger.error(f"❌ Failed to load ColPali model: {e}")
+             raise
+
+     def embed_query(self, query_text: str) -> torch.Tensor:
+         """
+         Generate an embedding for a text query.
+
+         Args:
+             query_text: Natural language query string
+
+         Returns:
+             Query embedding tensor of shape [num_patches, embedding_dim]
+         """
+         with torch.no_grad():
+             # Process the query using ColPali's query processing
+             processed_query = self.processor.process_queries([query_text]).to(self.model.device)
+             query_embedding = self.model(**processed_query)
+
+         return query_embedding
+
+     @property
+     def embedding_dim(self) -> int:
+         """Get the embedding dimension of the model."""
+         return self.model.config.hidden_size
+
+     @property
+     def image_token_id(self) -> int:
+         """Get the image token ID from the processor."""
+         return self.processor.image_token_id
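
As a rough sketch of what `embed_query` returns (assuming `colpali-engine` is installed and the model weights can be downloaded; the exact shape depends on the model and the tokenized query length):

```python
# Sketch: one multi-vector embedding per query, ColBERT-style.
from src.colpali.processor import ColPaliProcessor

processor = ColPaliProcessor(device="cpu")  # float32 on CPU by default
emb = processor.embed_query("solar panel installations in 2022")
# The model returns one vector per query token/patch; a leading batch
# dimension of 1 may be present depending on the colpali_engine version.
print(emb.shape)
```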
src/colpali/search.py ADDED
@@ -0,0 +1,494 @@
+ """
+ Visual Document Search Engine
+
+ Two-stage visual document retrieval:
+ 1. Fast prefetch using pooled vectors (mean/max with HNSW)
+ 2. Exact reranking using full multi-vector embeddings (ColBERT-style)
+ """
+
+ import logging
+ from typing import List, Dict, Any, Optional
+ import numpy as np
+ import torch
+ from qdrant_client import QdrantClient
+ from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, Range
+
+ logger = logging.getLogger(__name__)
+
+
+ class VisualDocumentSearch:
+     """
+     Two-stage visual document retrieval:
+     - Stage 1: Fast HNSW search with pooled vectors (10-100ms)
+     - Stage 2: Exact ColBERT reranking with full embeddings (100-500ms)
+     """
+
+     def __init__(
+         self,
+         qdrant_client: QdrantClient,
+         collection_name: str = "colSmol-500M"
+     ):
+         """
+         Initialize the search engine.
+
+         Args:
+             qdrant_client: Connected Qdrant client
+             collection_name: Name of the collection
+         """
+         self.client = qdrant_client
+         self.collection_name = collection_name
+
+     def get_filter_options(
+         self,
+         max_points: Optional[int] = None,
+         use_cache: bool = True,
+         progress_callback=None
+     ) -> Dict[str, List[Any]]:
+         """
+         Scan the collection for all possible filter values using iterative scrolling.
+
+         Args:
+             max_points: Maximum number of points to scan (None = scan all)
+             use_cache: Reserved for result caching (not yet implemented)
+             progress_callback: Optional callback function(points_scanned, elapsed_time, iteration)
+
+         Returns:
+             Dictionary with all unique values for each filterable field
+         """
+         scan_limit = max_points if max_points else "all"
+         logger.info(f"🔍 Starting metadata scan (target: {scan_limit} points)")
+         logger.info(f"   Collection: {self.collection_name}")
+
+         # Scroll through points to collect unique values
+         years = set()
+         sources = set()
+         districts = set()
+         filenames = set()
+
+         batch_size = 900
+         points_scanned = 0
+         offset = None
+         iteration = 0
+         max_iterations = 100
+
+         import time
+         start_time = time.time()
+
+         try:
+             while True:
+                 iteration += 1
+
+                 if iteration > max_iterations:
+                     logger.warning(f"⚠️ Reached max iterations ({max_iterations}), stopping")
+                     break
+
+                 if max_points and points_scanned >= max_points:
+                     logger.info(f"✅ Reached target of {max_points} points")
+                     break
+
+                 if max_points:
+                     remaining = max_points - points_scanned
+                     current_batch_size = min(batch_size, remaining)
+                 else:
+                     current_batch_size = batch_size
+
+                 elapsed = time.time() - start_time
+                 logger.info(f"   Batch {iteration}: fetching {current_batch_size} points (scanned: {points_scanned}, {elapsed:.1f}s)")
+
+                 batch_start = time.time()
+                 try:
+                     results = self.client.scroll(
+                         collection_name=self.collection_name,
+                         limit=current_batch_size,
+                         offset=offset,
+                         with_payload=True,
+                         with_vectors=False,
+                     )
+
+                     points, next_offset = results
+                     batch_time = time.time() - batch_start
+                     logger.info(f"   ✓ Fetched {len(points)} points in {batch_time:.2f}s")
+
+                 except Exception as scroll_error:
+                     logger.error(f"❌ Scroll failed at iteration {iteration}: {scroll_error}")
+                     break
+
+                 if not points:
+                     logger.info(f"✅ Reached end of collection (scanned {points_scanned} points)")
+                     break
+
+                 for point in points:
+                     payload = point.payload
+
+                     if payload.get('year'):
+                         year_value = payload['year']
+                         if isinstance(year_value, str):
+                             try:
+                                 year_value = int(year_value)
+                             except ValueError:
+                                 year_value = None  # Unparseable year; still collect the other fields
+                         if isinstance(year_value, int):
+                             years.add(year_value)
+
+                     if payload.get('source'):
+                         sources.add(payload['source'])
+                     if payload.get('district'):
+                         districts.add(payload['district'])
+                     if payload.get('filename'):
+                         filenames.add(payload['filename'])
+
+                 points_scanned += len(points)
+                 offset = next_offset
+
+                 if progress_callback:
+                     elapsed = time.time() - start_time
+                     progress_callback(points_scanned, elapsed, iteration)
+
+                 if offset is None:
+                     elapsed = time.time() - start_time
+                     logger.info(f"✅ Completed full scan: {points_scanned} points in {elapsed:.1f}s")
+                     break
+
+             elapsed = time.time() - start_time
+             logger.info(f"✅ Scan complete: {points_scanned} points in {elapsed:.1f}s")
+             logger.info(f"   Found: {len(years)} years, {len(sources)} sources, "
+                         f"{len(districts)} districts, {len(filenames)} files")
+
+         except Exception as e:
+             logger.error(f"❌ Error scanning collection: {e}")
+
+         return {
+             'years': sorted(years),
+             'sources': sorted(sources),
+             'districts': sorted(districts),
+             'filenames': sorted(filenames)
+         }
+
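
A small usage sketch for the scan above, with a progress callback; the client credentials and collection name are placeholders:

```python
# Sketch: scan at most 5,000 points and report progress per batch.
from qdrant_client import QdrantClient
from src.colpali.search import VisualDocumentSearch

client = QdrantClient(url="https://example.qdrant.cloud", api_key="YOUR_API_KEY")  # placeholders
search = VisualDocumentSearch(client, collection_name="colSmol-500M")

def report(points_scanned: int, elapsed: float, iteration: int) -> None:
    print(f"batch {iteration}: {points_scanned} points in {elapsed:.1f}s")

options = search.get_filter_options(max_points=5000, progress_callback=report)
print(options["years"])  # e.g. [2019, 2020, 2021]
```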
+     def build_filter(
+         self,
+         year: Optional[Any] = None,
+         source: Optional[Any] = None,
+         district: Optional[Any] = None,
+         filename: Optional[Any] = None,
+         has_text: Optional[bool] = None,
+         page_range: Optional[tuple] = None
+     ) -> Optional[Filter]:
+         """
+         Build a Qdrant filter from parameters.
+
+         Supports both single values and lists (using MatchAny for lists).
+         """
+         conditions = []
+
+         if year is not None:
+             if isinstance(year, list):
+                 year_values = [int(y) if isinstance(y, str) else y for y in year]
+                 conditions.append(
+                     FieldCondition(key="year", match=MatchAny(any=year_values))
+                 )
+                 logger.info(f"🔍 Filter: year IN {year_values}")
+             else:
+                 year_value = int(year) if isinstance(year, str) else year
+                 conditions.append(
+                     FieldCondition(key="year", match=MatchValue(value=year_value))
+                 )
+                 logger.info(f"🔍 Filter: year = {year_value}")
+
+         if source is not None:
+             if isinstance(source, list):
+                 conditions.append(
+                     FieldCondition(key="source", match=MatchAny(any=source))
+                 )
+                 logger.info(f"🔍 Filter: source IN {source}")
+             else:
+                 conditions.append(
+                     FieldCondition(key="source", match=MatchValue(value=source))
+                 )
+                 logger.info(f"🔍 Filter: source = {source}")
+
+         if district is not None:
+             if isinstance(district, list):
+                 conditions.append(
+                     FieldCondition(key="district", match=MatchAny(any=district))
+                 )
+                 logger.info(f"🔍 Filter: district IN {district}")
+             else:
+                 conditions.append(
+                     FieldCondition(key="district", match=MatchValue(value=district))
+                 )
+                 logger.info(f"🔍 Filter: district = {district}")
+
+         if filename is not None:
+             if isinstance(filename, list):
+                 conditions.append(
+                     FieldCondition(key="filename", match=MatchAny(any=filename))
+                 )
+                 logger.info(f"🔍 Filter: filename IN {filename}")
+             else:
+                 conditions.append(
+                     FieldCondition(key="filename", match=MatchValue(value=filename))
+                 )
+                 logger.info(f"🔍 Filter: filename = {filename}")
+
+         if has_text is not None:
+             conditions.append(
+                 FieldCondition(key="has_text", match=MatchValue(value=has_text))
+             )
+
+         if page_range is not None:
+             min_page, max_page = page_range
+             conditions.append(
+                 FieldCondition(
+                     key="page_number",
+                     range=Range(gte=min_page, lte=max_page)
+                 )
+             )
+
+         if not conditions:
+             return None
+
+         return Filter(must=conditions)
+
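
To illustrate the scalar-vs-list handling, a sketch continuing the `search` instance from the previous example (the field values are illustrative):

```python
# Lists become MatchAny, scalars become MatchValue, page_range becomes Range.
f = search.build_filter(
    year=[2021, 2022],       # list   -> MatchAny(any=[2021, 2022])
    source="annual_report",  # scalar -> MatchValue(value="annual_report")
    page_range=(1, 10),      # inclusive page_number Range(gte=1, lte=10)
)
# f is a qdrant_client Filter with must=[...], or None if no arguments are set.
```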
+     def search_stage1_prefetch(
+         self,
+         query_embedding: torch.Tensor,
+         top_k: int = 100,
+         filter_obj: Optional[Filter] = None,
+         use_pooling: bool = False,
+         pooling_method: str = "mean"
+     ) -> List[Dict[str, Any]]:
+         """
+         Stage 1: Prefetch candidates using either multi-vector or pooled search.
+         """
+         # Convert to numpy
+         if isinstance(query_embedding, torch.Tensor):
+             query_np = query_embedding.cpu().float().numpy()
+         else:
+             query_np = np.array(query_embedding, dtype=np.float32)
+
+         # Handle batch dimension
+         if query_np.ndim == 3:
+             query_np = query_np.squeeze(0)
+
+         # Strategy 1: Pooled search (fast, approximate)
+         if use_pooling:
+             if pooling_method == "mean":
+                 query_pooled = query_np.mean(axis=0)
+                 vector_name = "mean_pooling"
+             elif pooling_method == "max":
+                 query_pooled = query_np.max(axis=0)
+                 vector_name = "max_pooling"
+             else:
+                 raise ValueError(f"Unknown pooling method: {pooling_method}")
+
+             if query_pooled.ndim != 1:
+                 raise ValueError(f"Pooling failed! Expected a 1D vector, got shape {query_pooled.shape}")
+
+             query_vector = query_pooled.tolist()
+             logger.info(f"🔍 Pooled search: vector={vector_name}, dims={len(query_vector)}")
+
+         # Strategy 2: Native multi-vector search (SOTA)
+         else:
+             vector_name = "initial"
+             query_vector = query_np.tolist()
+             logger.info(f"🎯 Multi-vector search: vector={vector_name}, patches={len(query_vector)}, dims={len(query_vector[0])}")
+
+         try:
+             results = self.client.query_points(
+                 collection_name=self.collection_name,
+                 query=query_vector,
+                 using=vector_name,
+                 query_filter=filter_obj,
+                 limit=top_k,
+                 with_payload=True,
+                 with_vectors=False,
+                 timeout=120
+             ).points
+
+             logger.info(f"✅ Stage 1: Retrieved {len(results)} candidates")
+
+         except Exception as e:
+             logger.error(f"❌ Search with vector '{vector_name}' failed: {e}")
+             raise
+
+         candidates = []
+         for result in results:
+             candidates.append({
+                 'id': result.id,
+                 'score_stage1': result.score,
+                 'payload': result.payload
+             })
+
+         return candidates
+
+     def colbert_score(
+         self,
+         query_embedding: np.ndarray,
+         doc_embedding: np.ndarray
+     ) -> float:
+         """
+         Compute the ColBERT-style late-interaction score.
+         """
+         # Normalize embeddings
+         query_norm = query_embedding / (np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-8)
+         doc_norm = doc_embedding / (np.linalg.norm(doc_embedding, axis=1, keepdims=True) + 1e-8)
+
+         # Compute the similarity matrix
+         sim_matrix = np.dot(query_norm, doc_norm.T)
+
+         # For each query patch, take the max similarity with any doc patch
+         max_sims = sim_matrix.max(axis=1)
+
+         # Average across query patches
+         score = max_sims.mean()
+
+         return float(score)
+
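
A worked toy example of the MaxSim computation above, with unit-norm vectors so the dot products are already cosines:

```python
import numpy as np

q = np.array([[1.0, 0.0],          # 2 query vectors, dim 2
              [0.0, 1.0]], dtype=np.float32)
d = np.array([[1.0, 0.0],          # 3 document vectors, dim 2
              [0.6, 0.8],
              [0.0, -1.0]], dtype=np.float32)

sim = q @ d.T                      # [[1.0, 0.6, 0.0], [0.0, 0.8, -1.0]]
max_sims = sim.max(axis=1)         # best doc match per query vector: [1.0, 0.8]
print(max_sims.mean())             # late-interaction score: 0.9
```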
+     def search_stage2_rerank(
+         self,
+         query_embedding: torch.Tensor,
+         candidates: List[Dict[str, Any]],
+         top_k: int = 10
+     ) -> List[Dict[str, Any]]:
+         """
+         Stage 2: Exact reranking using full multi-vector embeddings.
+         """
+         if isinstance(query_embedding, torch.Tensor):
+             query_np = query_embedding.cpu().float().numpy()
+         else:
+             query_np = np.array(query_embedding, dtype=np.float32)
+
+         reranked = []
+         for candidate in candidates:
+             payload = candidate['payload']
+
+             full_embedding = payload.get('full_embedding')
+             if full_embedding is None:
+                 candidate['score_final'] = candidate['score_stage1']
+                 reranked.append(candidate)
+                 continue
+
+             doc_np = np.array(full_embedding, dtype=np.float32)
+             colbert_score = self.colbert_score(query_np, doc_np)
+
+             candidate['score_stage2'] = colbert_score
+             candidate['score_final'] = colbert_score
+             reranked.append(candidate)
+
+         reranked.sort(key=lambda x: x['score_final'], reverse=True)
+
+         return reranked[:top_k]
+
+     def search(
+         self,
+         query_embedding: torch.Tensor,
+         top_k: int = 10,
+         prefetch_k: Optional[int] = None,
+         year: Optional[int] = None,
+         source: Optional[str] = None,
+         district: Optional[str] = None,
+         filename: Optional[str] = None,
+         has_text: Optional[bool] = None,
+         page_range: Optional[tuple] = None,
+         search_strategy: str = "multi_vector",
+         pooling_method: str = "mean",
+         use_reranking: bool = False
+     ) -> List[Dict[str, Any]]:
+         """
+         Multi-strategy visual document search.
+
+         Search strategies:
+         1. "multi_vector" (DEFAULT, SOTA): native multi-vector search
+         2. "pooled": pooled search (fastest, less accurate)
+         3. "hybrid": two-stage retrieval with reranking
+         """
+         # Build filter
+         filter_obj = self.build_filter(
+             year=year,
+             source=source,
+             district=district,
+             filename=filename,
+             has_text=has_text,
+             page_range=page_range
+         )
+
+         # Strategy 1: Native multi-vector search (SOTA, default)
+         if search_strategy == "multi_vector":
+             logger.info("🎯 SOTA Multi-Vector Search: Querying 'initial' vector with native MaxSim")
+             candidates = self.search_stage1_prefetch(
+                 query_embedding=query_embedding,
+                 top_k=top_k,
+                 filter_obj=filter_obj,
+                 use_pooling=False
+             )
+
+             if not candidates:
+                 logger.warning("❌ No results found")
+                 return []
+
+             for c in candidates:
+                 c['score_final'] = c['score_stage1']
+
+             logger.info(f"✅ Retrieved {len(candidates)} results (native MaxSim)")
+             return candidates
+
+         # Strategy 2: Pooled search (fast, approximate)
+         elif search_strategy == "pooled":
+             logger.info(f"🔍 Pooled Search: Querying '{pooling_method}_pooling' vector")
+             candidates = self.search_stage1_prefetch(
+                 query_embedding=query_embedding,
+                 top_k=top_k,
+                 filter_obj=filter_obj,
+                 use_pooling=True,
+                 pooling_method=pooling_method
+             )
+
+             if not candidates:
+                 logger.warning("❌ No results found")
+                 return []
+
+             for c in candidates:
+                 c['score_final'] = c['score_stage1']
+
+             logger.info(f"✅ Retrieved {len(candidates)} results (pooled)")
+             return candidates
+
+         # Strategy 3: Hybrid two-stage
+         elif search_strategy == "hybrid":
+             if prefetch_k is None:
+                 prefetch_k = max(100, top_k * 10)
+
+             logger.info(f"🔄 Hybrid Search: Stage 1 - Prefetching {prefetch_k} with {pooling_method} pooling")
+             candidates = self.search_stage1_prefetch(
+                 query_embedding=query_embedding,
+                 top_k=prefetch_k,
+                 filter_obj=filter_obj,
+                 use_pooling=True,
+                 pooling_method=pooling_method
+             )
+
+             if not candidates:
+                 logger.warning("❌ No results found in stage 1")
+                 return []
+
+             logger.info(f"✅ Stage 1: Found {len(candidates)} candidates")
+
+             if use_reranking and len(candidates) > top_k:
+                 logger.info("🎯 Stage 2: Reranking with ColBERT scoring...")
+                 results = self.search_stage2_rerank(
+                     query_embedding=query_embedding,
+                     candidates=candidates,
+                     top_k=top_k
+                 )
+                 logger.info(f"✅ Reranked to top {len(results)} results")
+                 return results
+             else:
+                 results = candidates[:top_k]
+                 for r in results:
+                     r['score_final'] = r['score_stage1']
+                 logger.info(f"⏭️ Skipping reranking, returning top {len(results)}")
+                 return results
+
+         else:
+             raise ValueError(f"Unknown search_strategy: {search_strategy}")
+
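
A sketch of the three strategies side by side, reusing the `processor` and `search` objects from the earlier sketches; note the hybrid path only changes the ranking when points store a `full_embedding` payload:

```python
query_emb = processor.embed_query("flood damage reports")

exact = search.search(query_emb, top_k=5)  # "multi_vector" (default)
fast = search.search(query_emb, top_k=5, search_strategy="pooled",
                     pooling_method="mean")
hybrid = search.search(query_emb, top_k=5, search_strategy="hybrid",
                       prefetch_k=100, use_reranking=True)

for hit in exact:
    print(hit["payload"].get("filename"), hit["score_final"])
```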
src/colpali/visual_search.py ADDED
@@ -0,0 +1,237 @@
+ """
+ Visual Document Search Adapter for the Main App
+
+ This module provides an adapter to integrate ColPali visual search
+ into the main app's retrieval pipeline.
+
+ All dependencies are now within src/colpali/ - no external colpali_colab_package needed.
+ """
+
+ import logging
+ from typing import List, Dict, Any, Optional
+ import numpy as np
+ import torch
+ from qdrant_client import QdrantClient
+
+ # Import from local src/colpali modules (no external dependencies)
+ from src.colpali.processor import ColPaliProcessor
+ from src.colpali.search import VisualDocumentSearch
+
+ # Import device detection utility
+ from src.utils import get_device_for_colpali
+
+ logger = logging.getLogger(__name__)
+
+
+ class VisualSearchResult:
+     """
+     Wrapper for visual search results matching the interface expected by app.py.
+     """
+     def __init__(self, point_id: str, score: float, payload: Dict[str, Any]):
+         self.id = point_id
+         self.score = score
+         self.payload = payload
+         self.metadata = payload  # Alias for compatibility
+
+         # Extract content for compatibility with the Document interface
+         self.page_content = payload.get('text', '')
+         self.content = self.page_content
+
+     def __repr__(self):
+         return f"VisualSearchResult(id={self.id}, score={self.score:.4f})"
+
+
+ class VisualSearchAdapter:
+     """
+     Adapter to integrate ColPali visual search into the main app.
+
+     This provides a unified interface for visual document retrieval that works
+     with the existing chatbot architecture.
+     """
+
+     def __init__(
+         self,
+         qdrant_url: str,
+         qdrant_api_key: str,
+         collection_name: str = "colSmol-500M",
+         model_name: str = "vidore/colSmol-500M",
+         device: Optional[str] = None,
+         batch_size: int = 4
+     ):
+         """
+         Initialize the visual search adapter.
+
+         Args:
+             qdrant_url: Qdrant cluster URL
+             qdrant_api_key: Qdrant API key
+             collection_name: Name of the collection with visual embeddings
+             model_name: ColPali model name
+             device: Device to use (cuda/cpu/mps, auto-detected if None)
+             batch_size: Batch size for embedding generation
+         """
+         logger.info("🎨 Initializing Visual Search Adapter...")
+
+         # Auto-detect device using the utility function
+         if device is None:
+             device = get_device_for_colpali()
+
+         self.device = device
+         logger.info(f"   Device: {device}")
+
+         # Initialize Qdrant client
+         logger.info(f"   Connecting to Qdrant: {qdrant_url}")
+         self.client = QdrantClient(
+             url=qdrant_url,
+             api_key=qdrant_api_key,
+             prefer_grpc=False,  # Use HTTP for compatibility
+             timeout=60
+         )
+
+         # Initialize search engine (from local src/colpali/search.py)
+         self.search_engine = VisualDocumentSearch(
+             qdrant_client=self.client,
+             collection_name=collection_name
+         )
+
+         # Initialize processor (from local src/colpali/processor.py)
+         logger.info(f"   Loading model: {model_name}")
+         torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
+         self.processor = ColPaliProcessor(
+             model_name=model_name,
+             device=device,
+             torch_dtype=torch_dtype,
+             batch_size=batch_size
+         )
+
+         # Store the last query embedding for saliency generation
+         self.last_query_embedding = None
+         self.collection_name = collection_name
+
+         logger.info("✅ Visual Search Adapter initialized!")
+
+     def search(
+         self,
+         query: str,
+         top_k: int = 10,
+         filters: Optional[Dict[str, Any]] = None,
+         search_strategy: str = "multi_vector",
+         **kwargs
+     ) -> List[VisualSearchResult]:
+         """
+         Search for visually similar documents.
+
+         Args:
+             query: Text query
+             top_k: Number of results to return
+             filters: Optional filters (year, source, district, filename, has_text)
+             search_strategy: Search strategy (multi_vector, pooled, hybrid)
+             **kwargs: Additional search parameters
+
+         Returns:
+             List of VisualSearchResult objects
+         """
+         logger.info(f"🔍 Visual search: '{query}' (top_k={top_k}, strategy={search_strategy})")
+
+         # Generate query embedding
+         query_embedding = self.processor.embed_query(query)
+
+         # Store for saliency generation
+         self.last_query_embedding = query_embedding
+
+         # Convert filters to Qdrant format
+         filter_params = {}
+         if filters:
+             if 'sources' in filters and filters['sources']:
+                 filter_params['source'] = filters['sources']
+             if 'years' in filters and filters['years']:
+                 years = filters['years']
+                 if isinstance(years, list):
+                     filter_params['year'] = [int(y) if isinstance(y, str) else y for y in years]
+                 else:
+                     filter_params['year'] = int(years) if isinstance(years, str) else years
+             if 'districts' in filters and filters['districts']:
+                 filter_params['district'] = filters['districts']
+             if 'filenames' in filters and filters['filenames']:
+                 filter_params['filename'] = filters['filenames']
+             if 'has_text' in filters:
+                 filter_params['has_text'] = filters['has_text']
+
+         logger.info(f"🔍 Visual search: Converted filter params: {filter_params}")
+
+         # Perform search
+         results = self.search_engine.search(
+             query_embedding=query_embedding,
+             top_k=top_k,
+             search_strategy=search_strategy,
+             **filter_params,
+             **kwargs
+         )
+
+         # Fallback: if there are 0 results with filters, retry without filters
+         if not results and filter_params:
+             logger.warning("⚠️ Visual search: 0 results with filters, retrying WITHOUT filters...")
+             results = self.search_engine.search(
+                 query_embedding=query_embedding,
+                 top_k=top_k,
+                 search_strategy=search_strategy,
+                 **kwargs  # No filter_params
+             )
+             if results:
+                 logger.info(f"✅ Visual search: Found {len(results)} results after removing filters")
+             else:
+                 logger.warning("❌ Visual search: Still 0 results even without filters")
+
+         # Convert to VisualSearchResult objects
+         visual_results = []
+         for result in results:
+             visual_result = VisualSearchResult(
+                 point_id=result['id'],
+                 score=result.get('score_final', result.get('score', 0.0)),
+                 payload=result['payload']
+             )
+             visual_results.append(visual_result)
+
+         logger.info(f"✅ Found {len(visual_results)} visual results")
+         return visual_results
+
+     def get_filter_options(self) -> Dict[str, List[Any]]:
+         """
+         Get available filter options from the collection.
+
+         Returns:
+             Dictionary with years, sources, districts, filenames
+         """
+         return self.search_engine.get_filter_options()
+
+
+ def create_visual_search_adapter(
+     qdrant_url: Optional[str] = None,
+     qdrant_api_key: Optional[str] = None,
+     collection_name: str = "colSmol-500M"
+ ) -> VisualSearchAdapter:
+     """
+     Factory function to create a visual search adapter.
+
+     Args:
+         qdrant_url: Qdrant URL (read from the environment if not provided)
+         qdrant_api_key: Qdrant API key (read from the environment if not provided)
+         collection_name: Collection name
+
+     Returns:
+         Initialized VisualSearchAdapter
+     """
+     import os
+
+     if qdrant_url is None:
+         qdrant_url = os.environ.get("QDRANT_URL")
+     if qdrant_api_key is None:
+         qdrant_api_key = os.environ.get("QDRANT_API_KEY")
+
+     if not qdrant_url or not qdrant_api_key:
+         raise ValueError("QDRANT_URL and QDRANT_API_KEY must be provided or set in the environment")
+
+     return VisualSearchAdapter(
+         qdrant_url=qdrant_url,
+         qdrant_api_key=qdrant_api_key,
+         collection_name=collection_name
+     )
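
A usage sketch for the factory above, reading credentials from the environment (the values here are placeholders):

```python
import os

os.environ.setdefault("QDRANT_URL", "https://example.qdrant.cloud")  # placeholder
os.environ.setdefault("QDRANT_API_KEY", "YOUR_API_KEY")              # placeholder

from src.colpali.visual_search import create_visual_search_adapter

adapter = create_visual_search_adapter()
hits = adapter.search(
    "school renovation budgets",
    top_k=5,
    filters={"years": [2021, 2022], "sources": ["annual_report"]},  # illustrative
)
for h in hits:
    print(h)  # VisualSearchResult(id=..., score=...)
```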
src/colpali/visualizer.py ADDED
@@ -0,0 +1,236 @@
+ """
+ ColPali Visualization Module
+
+ Generates attention/saliency maps to visualize which parts of a document
+ are most relevant to a query.
+ """
+
+ import torch
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from typing import List, Dict, Any, Optional
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.colors import LinearSegmentedColormap
+ import logging
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def generate_saliency_maps(
+     query_embedding: torch.Tensor,
+     image_embeddings: List[torch.Tensor],
+     images: List[Image.Image],
+     processor,
+     model,
+     top_k: int = 5,
+     threshold: float = 0.5
+ ) -> List[Image.Image]:
+     """
+     Generate saliency/attention maps showing which parts of the images are most relevant.
+
+     Args:
+         query_embedding: Query embedding tensor [num_query_patches, embedding_dim]
+         image_embeddings: List of image embedding tensors, each [num_patches, embedding_dim]
+         images: List of PIL Images corresponding to the embeddings
+         processor: ColPali processor for scoring
+         model: ColPali model
+         top_k: Number of top images to visualize
+         threshold: Threshold for highlighting (0-1)
+
+     Returns:
+         List of annotated images with saliency overlays
+     """
+     logger.info(f"🎨 Generating saliency maps for {len(images)} images")
+
+     # Calculate scores for all images
+     scores = []
+     for img_emb in image_embeddings:
+         # Use the processor's scoring method
+         score = processor.score_multi_vector(query_embedding.unsqueeze(0), img_emb.unsqueeze(0))
+         scores.append(score.item() if isinstance(score, torch.Tensor) else score)
+
+     # Get the top-k images
+     top_indices = np.argsort(scores)[-top_k:][::-1]
+
+     annotated_images = []
+
+     for idx in top_indices:
+         image = images[idx]
+         embedding = image_embeddings[idx]
+         score = scores[idx]
+
+         # Create the saliency map.
+         # For ColPali, we can visualize patch-level relevance:
+         # each patch in the embedding corresponds to a region in the image.
+
+         # Calculate patch-level scores.
+         # Query embedding: [num_query_patches, dim]
+         # Image embedding: [num_image_patches, dim]
+         # Compute similarity for each patch pair.
+         query_np = query_embedding.cpu().numpy()
+         img_np = embedding.cpu().numpy()
+
+         # Compute cosine similarity for each patch:
+         # normalize
+         query_norm = query_np / (np.linalg.norm(query_np, axis=1, keepdims=True) + 1e-8)
+         img_norm = img_np / (np.linalg.norm(img_np, axis=1, keepdims=True) + 1e-8)
+
+         # Compute the similarity matrix: [num_query_patches, num_image_patches]
+         similarity_matrix = np.dot(query_norm, img_norm.T)
+
+         # Get the max similarity per image patch (best match from any query patch)
+         patch_scores = similarity_matrix.max(axis=0)  # [num_image_patches]
+
+         # Normalize scores to [0, 1]
+         patch_scores = (patch_scores - patch_scores.min()) / (patch_scores.max() - patch_scores.min() + 1e-8)
+
+         # Create the overlay image
+         annotated = _create_saliency_overlay(
+             image,
+             patch_scores,
+             score,
+             threshold=threshold
+         )
+
+         annotated_images.append(annotated)
+
+     logger.info(f"✅ Generated {len(annotated_images)} saliency maps")
+
+     return annotated_images
+
+
+ def _create_saliency_overlay(
+     image: Image.Image,
+     patch_scores: np.ndarray,
+     overall_score: float,
+     threshold: float = 0.5,
+     patch_size: int = 16  # Approximate patch size in pixels
+ ) -> Image.Image:
+     """
+     Create a saliency overlay on the image.
+
+     Args:
+         image: Original PIL Image
+         patch_scores: Array of scores for each patch [num_patches]
+         overall_score: Overall relevance score
+         threshold: Threshold for highlighting
+         patch_size: Size of each patch in pixels
+
+     Returns:
+         Annotated PIL Image
+     """
+     # Convert to a numpy array
+     img_array = np.array(image)
+     h, w = img_array.shape[:2]
+
+     # Estimate grid dimensions.
+     # ColPali typically uses a grid of patches;
+     # for simplicity, assume a square grid first.
+     num_patches = len(patch_scores)
+     grid_size = int(np.sqrt(num_patches))
+
+     if grid_size * grid_size != num_patches:
+         # Non-square grid, try to estimate from
+         # common aspect ratios
+         aspect_ratio = w / h
+         cols = int(np.sqrt(num_patches * aspect_ratio))
+         rows = int(num_patches / cols)
+         if cols * rows != num_patches:
+             # Fall back to a square grid
+             grid_size = int(np.sqrt(num_patches))
+             rows = cols = grid_size
+     else:
+         rows = cols = grid_size
+
+     # Calculate patch dimensions
+     patch_h = h // rows
+     patch_w = w // cols
+
+     # Create the overlay
+     overlay = np.zeros((h, w, 4), dtype=np.uint8)  # RGBA
+
+     # Create the colormap (red for high relevance)
+     cmap = plt.cm.Reds
+
+     patch_idx = 0
+     for i in range(rows):
+         for j in range(cols):
+             if patch_idx >= len(patch_scores):
+                 break
+
+             score = patch_scores[patch_idx]
+
+             if score >= threshold:
+                 # Calculate patch bounds
+                 y1 = i * patch_h
+                 y2 = min((i + 1) * patch_h, h)
+                 x1 = j * patch_w
+                 x2 = min((j + 1) * patch_w, w)
+
+                 # Get the color from the colormap
+                 color = cmap(score)[:3]  # RGB
+                 color_uint8 = (np.array(color) * 255).astype(np.uint8)
+
+                 # Set the overlay
+                 overlay[y1:y2, x1:x2, :3] = color_uint8
+                 overlay[y1:y2, x1:x2, 3] = int(score * 128)  # Alpha based on score
+
+             patch_idx += 1
+
+     # Blend the overlay with the original image
+     overlay_img = Image.fromarray(overlay, 'RGBA')
+     annotated = Image.alpha_composite(image.convert('RGBA'), overlay_img)
+
+     # Add a text annotation with the score
+     draw = ImageDraw.Draw(annotated)
+     try:
+         font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24)
+     except Exception:
+         font = ImageFont.load_default()
+
+     score_text = f"Relevance: {overall_score:.3f}"
+     draw.text((10, 10), score_text, fill=(255, 255, 255, 255), font=font, stroke_width=2, stroke_fill=(0, 0, 0, 255))
+
+     return annotated.convert('RGB')
+
+
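
To make the grid estimation above concrete, a worked example with assumed numbers (768 patches on a 1240x1754 page render):

```python
import numpy as np

num_patches, w, h = 768, 1240, 1754
grid_size = int(np.sqrt(num_patches))            # 27; 27 * 27 = 729 != 768 -> non-square
aspect_ratio = w / h                             # ~0.707
cols = int(np.sqrt(num_patches * aspect_ratio))  # 23
rows = int(num_patches / cols)                   # 33; 23 * 33 = 759 != 768 -> fallback
rows = cols = grid_size                          # square fallback: a 27 x 27 grid
# In the fallback case, the last num_patches - rows * cols = 39 patch scores
# are simply never drawn by the overlay loop.
print(rows, cols)
```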
+ def visualize_retrieval_results(
+     query: str,
+     retrieved_docs: List[Dict[str, Any]],
+     output_path: Optional[str] = None
+ ) -> None:
+     """
+     Visualize retrieval results with images and scores.
+
+     Args:
+         query: Original query text
+         retrieved_docs: List of retrieved documents with images and scores
+         output_path: Optional path to save the visualization
+     """
+     num_docs = len(retrieved_docs)
+     fig, axes = plt.subplots(1, num_docs, figsize=(5 * num_docs, 5))
+
+     if num_docs == 1:
+         axes = [axes]
+
+     for idx, (doc, ax) in enumerate(zip(retrieved_docs, axes)):
+         if 'image' in doc:
+             ax.imshow(doc['image'])
+             ax.set_title(f"Rank {idx+1}\nScore: {doc.get('score', 0):.3f}")
+         ax.axis('off')
+
+     plt.suptitle(f"Query: {query}", fontsize=14, fontweight='bold')
+     plt.tight_layout()
+
+     if output_path:
+         plt.savefig(output_path, dpi=150, bbox_inches='tight')
+         logger.info(f"💾 Saved visualization to: {output_path}")
+     else:
+         plt.show()
+
+     plt.close()
+
+
+
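
Finally, a sketch of `visualize_retrieval_results` with synthetic inputs (blank images stand in for real page renders):

```python
from PIL import Image
from src.colpali.visualizer import visualize_retrieval_results

docs = [
    {"image": Image.new("RGB", (200, 280), "white"), "score": 0.91},
    {"image": Image.new("RGB", (200, 280), "gray"), "score": 0.87},
]
visualize_retrieval_results(
    query="flood damage reports",
    retrieved_docs=docs,
    output_path="results.png",  # omit to plt.show() instead
)
```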