Spaces:

Yeroyan
/

visual-rag-toolkit

Running

App Files Files Community

Yeroyan committed on Feb 5

Commit

9513cca

verified ·

1 Parent(s): d9f2c00

sync v0.1.3

Browse files

Files changed (25) hide show

benchmarks/__init__.py +1 -0
benchmarks/quick_test.py +566 -0
visual_rag/__init__.py +11 -7
visual_rag/cli/__init__.py +0 -2
visual_rag/cli/main.py +125 -133
visual_rag/config.py +61 -53
visual_rag/demo_runner.py +90 -0
visual_rag/embedding/__init__.py +4 -5
visual_rag/embedding/pooling.py +53 -51
visual_rag/embedding/visual_embedder.py +137 -91
visual_rag/indexing/__init__.py +38 -0
visual_rag/indexing/cloudinary_uploader.py +46 -51
visual_rag/indexing/pdf_processor.py +85 -76
visual_rag/indexing/pipeline.py +170 -125
visual_rag/indexing/qdrant_indexer.py +162 -143
visual_rag/preprocessing/__init__.py +0 -2
visual_rag/preprocessing/crop_empty.py +15 -7
visual_rag/qdrant_admin.py +29 -12
visual_rag/retrieval/__init__.py +2 -2
visual_rag/retrieval/multi_vector.py +64 -64
visual_rag/retrieval/single_stage.py +17 -18
visual_rag/retrieval/three_stage.py +1 -2
visual_rag/retrieval/two_stage.py +73 -94
visual_rag/visualization/__init__.py +1 -1
visual_rag/visualization/saliency.py +63 -67

benchmarks/__init__.py CHANGED Viewed

@@ -8,3 +8,4 @@ work in Docker/Spaces environments.
 """
 __all__ = []


8	"""
9
10	__all__ = []
11	+

benchmarks/quick_test.py ADDED Viewed

	@@ -0,0 +1,566 @@

+#!/usr/bin/env python3
+"""
+Quick Benchmark - Validate retrieval quality with ViDoRe data.
+This script:
+1. Downloads samples from ViDoRe (with ground truth relevance)
+2. Embeds with ColSmol-500M
+3. Tests retrieval strategies (exhaustive vs two-stage)
+4. Computes METRICS: NDCG@K, MRR@K, Recall@K
+5. Compares speed and quality
+Usage:
+    python quick_test.py --samples 100
+    python quick_test.py --samples 500 --skip-exhaustive  # Faster
+"""
+import sys
+import time
+import argparse
+import logging
+from pathlib import Path
+from typing import List, Dict, Any
+# Add parent directory to Python path (so we can import visual_rag)
+# This allows running the script directly without pip install
+_script_dir = Path(__file__).parent
+_parent_dir = _script_dir.parent
+if str(_parent_dir) not in sys.path:
+    sys.path.insert(0, str(_parent_dir))
+import numpy as np
+from tqdm import tqdm
+# Visual RAG imports (now works without pip install)
+from visual_rag.embedding import VisualEmbedder
+from visual_rag.embedding.pooling import (
+    tile_level_mean_pooling,
+    compute_maxsim_score,
+)
+# Optional: datasets for ViDoRe
+try:
+    from datasets import load_dataset as hf_load_dataset
+    HAS_DATASETS = True
+except ImportError:
+    HAS_DATASETS = False
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+def load_vidore_sample(num_samples: int = 100) -> List[Dict]:
+    """
+    Load the first ``num_samples`` rows of the ViDoRe DocVQA test split.
+    Every row yields one sample dict holding the page image, the query text,
+    and matching ``doc_id`` / ``query_id`` / ``relevant_doc`` identifiers, so
+    query ``i`` is treated as relevant to document ``i`` (1:1 ground truth).
+    Exits the process with status 1 when the ``datasets`` package is missing.
+    """
+    if not HAS_DATASETS:
+        logger.error("Install datasets: pip install datasets")
+        sys.exit(1)
+    logger.info(f"📥 Loading {num_samples} samples from ViDoRe DocVQA...")
+    dataset = hf_load_dataset("vidore/docvqa_test_subsampled", split="test")
+    # Pairing with range() caps the number of rows taken at num_samples.
+    samples = [
+        {
+            "id": idx,
+            "doc_id": f"doc_{idx}",
+            "query_id": f"q_{idx}",
+            # Column names differ between dataset variants, hence the fallbacks.
+            "image": record.get("image", record.get("page_image")),
+            "query": record.get("query", record.get("question", "")),
+            # Ground truth: query idx is relevant to doc idx
+            "relevant_doc": f"doc_{idx}",
+        }
+        for idx, record in zip(range(num_samples), dataset)
+    ]
+    logger.info(f"✅ Loaded {len(samples)} samples with ground truth")
+    return samples
+def _as_numpy(tensor_like) -> np.ndarray:
+    """Convert an embedder output (tensor-like or array-like) to a NumPy array."""
+    if hasattr(tensor_like, 'cpu'):
+        tensor_like = tensor_like.cpu()
+    return tensor_like.numpy() if hasattr(tensor_like, 'numpy') else np.array(tensor_like)
+def embed_all(
+    samples: List[Dict],
+    model_name: str = "vidore/colSmol-500M",
+) -> Dict[str, Any]:
+    """
+    Embed all documents and queries.
+    Args:
+        samples: Sample dicts providing "image", "query", "doc_id" and
+            "query_id" keys.
+        model_name: Model identifier passed to ``VisualEmbedder``.
+    Returns:
+        Dict with:
+        - "docs": doc_id -> {"embedding", "pooled", "num_visual_tokens", "num_tiles"}
+        - "queries": query_id -> float32 query embedding (samples whose query
+          is empty are skipped and get no entry)
+        - "samples", "doc_embed_time", "query_embed_time", "model"
+    """
+    logger.info(f"\n🤖 Loading model: {model_name}")
+    embedder = VisualEmbedder(model_name=model_name)
+    images = [s["image"] for s in samples]
+    # Keep every query paired with its own sample id. Enumerating a filtered
+    # list would shift the "q_{i}" keys as soon as one sample has an empty
+    # query, silently scoring later queries against the wrong ground truth.
+    query_items = [(s["query_id"], s["query"]) for s in samples if s["query"]]
+    # Embed images
+    logger.info(f"🎨 Embedding {len(images)} documents...")
+    start_time = time.time()
+    embeddings, token_infos = embedder.embed_images(
+        images, batch_size=4, return_token_info=True
+    )
+    doc_embed_time = time.time() - start_time
+    # Avoid a ZeroDivisionError when called with an empty sample list.
+    per_doc_ms = doc_embed_time / len(images) * 1000 if images else 0.0
+    logger.info(f"   Time: {doc_embed_time:.2f}s ({per_doc_ms:.1f}ms/doc)")
+    # Process embeddings: extract visual tokens + tile-level pooling
+    doc_data = {}
+    for sample, emb, token_info in zip(samples, embeddings, token_infos):
+        emb_np = _as_numpy(emb)
+        # Extract visual tokens only (filter special tokens)
+        visual_indices = token_info["visual_token_indices"]
+        visual_emb = emb_np[visual_indices].astype(np.float32)
+        # Tile-level pooling: n_rows * n_cols tiles plus one extra tile
+        # (presumably a global view -- confirm against the embedder);
+        # falls back to 13 tiles when the grid size is missing or zero.
+        n_rows = token_info.get("n_rows", 4)
+        n_cols = token_info.get("n_cols", 3)
+        num_tiles = n_rows * n_cols + 1 if n_rows and n_cols else 13
+        tile_pooled = tile_level_mean_pooling(visual_emb, num_tiles, patches_per_tile=64)
+        doc_data[sample["doc_id"]] = {
+            "embedding": visual_emb,
+            "pooled": tile_pooled,
+            "num_visual_tokens": len(visual_indices),
+            "num_tiles": tile_pooled.shape[0],
+        }
+    # Embed queries
+    logger.info(f"🔍 Embedding {len(query_items)} queries...")
+    start_time = time.time()
+    query_data = {}
+    for query_id, query in tqdm(query_items, desc="Queries"):
+        query_data[query_id] = _as_numpy(embedder.embed_query(query)).astype(np.float32)
+    query_embed_time = time.time() - start_time
+    return {
+        "docs": doc_data,
+        "queries": query_data,
+        "samples": samples,
+        "doc_embed_time": doc_embed_time,
+        "query_embed_time": query_embed_time,
+        "model": model_name,
+    }
+def search_exhaustive(query_emb: np.ndarray, docs: Dict, top_k: int = 10) -> List[Dict]:
+    """
+    Score every document with exact MaxSim and return the best ``top_k``.
+    Each hit is a dict with the document "id" and its MaxSim "score",
+    ordered from highest to lowest score.
+    """
+    ranked_hits = sorted(
+        (
+            {"id": doc_id, "score": compute_maxsim_score(query_emb, entry["embedding"])}
+            for doc_id, entry in docs.items()
+        ),
+        key=lambda hit: hit["score"],
+        reverse=True,
+    )
+    return ranked_hits[:top_k]
+def search_two_stage(
+    query_emb: np.ndarray,
+    docs: Dict,
+    prefetch_k: int = 20,
+    top_k: int = 10,
+) -> List[Dict]:
+    """
+    Two-stage retrieval with tile-level pooling.
+    Stage 1: Fast prefetch using tile-pooled vectors
+    Stage 2: Exact MaxSim reranking on candidates
+    Args:
+        query_emb: Query token embeddings; averaged over axis 0 for stage 1.
+        docs: doc_id -> dict with a 2-D "pooled" array (one row per tile)
+            and the full "embedding" used for MaxSim.
+        prefetch_k: Number of stage-1 candidates passed on to reranking.
+        top_k: Number of reranked results to return.
+    Returns:
+        Up to ``top_k`` dicts with "id", "score" (MaxSim) and "stage1_rank"
+        (1-based position in the stage-1 ordering), best score first.
+    """
+    # Stage 1: Tile-level pooled search
+    query_pooled = query_emb.mean(axis=0)
+    query_pooled = query_pooled / (np.linalg.norm(query_pooled) + 1e-8)
+    stage1_scores = []
+    for doc_id, doc in docs.items():
+        doc_pooled = doc["pooled"]
+        doc_norm = doc_pooled / (np.linalg.norm(doc_pooled, axis=1, keepdims=True) + 1e-8)
+        # Cosine similarity of the pooled query against each tile; the best
+        # matching tile represents the document.
+        tile_sims = np.dot(doc_norm, query_pooled)
+        stage1_scores.append({"id": doc_id, "score": float(tile_sims.max())})
+    stage1_scores.sort(key=lambda x: x["score"], reverse=True)
+    candidates = stage1_scores[:prefetch_k]
+    # Stage 2: Exact MaxSim on candidates
+    # Candidates are a prefix of the sorted stage-1 list, so enumerate() gives
+    # the stage-1 rank directly instead of a linear .index() scan per
+    # candidate inside the timed search path.
+    reranked = [
+        {
+            "id": cand["id"],
+            "score": compute_maxsim_score(query_emb, docs[cand["id"]]["embedding"]),
+            "stage1_rank": stage1_rank,
+        }
+        for stage1_rank, cand in enumerate(candidates, start=1)
+    ]
+    reranked.sort(key=lambda x: x["score"], reverse=True)
+    return reranked[:top_k]
+def compute_metrics(
+    results: Dict[str, List[Dict]],
+    samples: List[Dict],
+    k_values: List[int] = None,
+) -> Dict[str, float]:
+    """
+    Compute retrieval metrics.
+    Since ViDoRe has 1:1 query-doc mapping (1 relevant doc per query):
+    - Recall@K (Hit Rate): Is the relevant doc in top-K? (0 or 1)
+    - Precision@K: (# relevant in top-K) / K
+    - MRR@K: 1/rank if found in top-K, else 0
+    - NDCG@K: DCG / IDCG with binary relevance (IDCG is 1, so NDCG == DCG)
+    Args:
+        results: query_id -> ranked list of hits, each with an "id" key.
+        samples: Dicts with "query_id" and "relevant_doc"; samples whose
+            query_id is absent from ``results`` are ignored.
+        k_values: Positive cutoffs to evaluate; defaults to [1, 3, 5, 7, 10].
+    Returns:
+        Dict with "Recall@k", "P@k", "MRR@k" and "NDCG@k" per cutoff, plus
+        "avg_rank", "median_rank" and "not_found" computed over the complete
+        returned rankings when at least one query was evaluated.
+    """
+    if k_values is None:
+        # Built per call so the default is never a shared mutable object.
+        k_values = [1, 3, 5, 7, 10]
+    # Locate the relevant document once per query (1-indexed, 0 if absent);
+    # every cutoff below is derived from this single rank.
+    ranks = []
+    for sample in samples:
+        query_id = sample["query_id"]
+        if query_id not in results:
+            continue
+        relevant_doc = sample["relevant_doc"]
+        ranks.append(
+            next(
+                (
+                    position
+                    for position, hit in enumerate(results[query_id], start=1)
+                    if hit["id"] == relevant_doc
+                ),
+                0,
+            )
+        )
+    metrics = {}
+    for k in k_values:
+        # Rank as seen inside the top-k window; 0 marks a miss at this cutoff.
+        ranks_at_k = [rank if 0 < rank <= k else 0 for rank in ranks]
+        hits = [1.0 if rank else 0.0 for rank in ranks_at_k]
+        metrics[f"Recall@{k}"] = np.mean(hits)
+        # With 1 relevant doc: 1/K if found, 0 otherwise
+        metrics[f"P@{k}"] = np.mean([hit / k for hit in hits])
+        metrics[f"MRR@{k}"] = np.mean([1.0 / rank if rank else 0.0 for rank in ranks_at_k])
+        # DCG = 1/log2(rank+1) if found; IDCG = 1/log2(2) = 1
+        metrics[f"NDCG@{k}"] = np.mean(
+            [1.0 / np.log2(rank + 1) if rank else 0.0 for rank in ranks_at_k]
+        )
+    # Add summary stats over the full (untruncated) rankings
+    if k_values and ranks:
+        found_ranks = [rank for rank in ranks if rank > 0]
+        metrics["avg_rank"] = np.mean(found_ranks) if found_ranks else float('inf')
+        metrics["median_rank"] = np.median(found_ranks) if found_ranks else float('inf')
+        metrics["not_found"] = sum(1 for rank in ranks if rank == 0)
+    return metrics
+def _time_searches(samples: List[Dict], queries: Dict, search_fn, desc: str):
+    """
+    Run ``search_fn(query_emb)`` for each sample that has a query embedding.
+    Returns ``(rankings, timings, skipped)``: rankings keyed by query_id,
+    per-query wall-clock durations in seconds, and the number of samples
+    skipped because ``queries`` has no entry for their query_id.
+    """
+    rankings = {}
+    timings = []
+    skipped = 0
+    for sample in tqdm(samples, desc=desc):
+        query_id = sample["query_id"]
+        if query_id not in queries:
+            # No embedding for this query: skip it rather than raise KeyError.
+            skipped += 1
+            continue
+        query_emb = queries[query_id]
+        # perf_counter is monotonic and high-resolution, which suits
+        # millisecond-scale measurements better than time.time().
+        start = time.perf_counter()
+        ranking = search_fn(query_emb)
+        timings.append(time.perf_counter() - start)
+        rankings[query_id] = ranking
+    return rankings, timings, skipped
+def run_benchmark(
+    data: Dict,
+    skip_exhaustive: bool = False,
+    prefetch_k: int = None,
+    top_k: int = 10,
+) -> Dict[str, Dict]:
+    """
+    Run retrieval benchmark with metrics.
+    Args:
+        data: Dict with "docs", "queries" and "samples" entries.
+        skip_exhaustive: When True, only the two-stage method is run.
+        prefetch_k: Stage-1 candidate count; when None it is 20 for up to
+            100 docs, otherwise 20% of the docs clamped to [20, 100].
+        top_k: Number of results each method returns per query.
+    Returns:
+        Dict with a "two_stage" metrics dict and, unless skipped, an
+        "exhaustive" one; each also carries "avg_time_ms" and "top_k"
+        ("two_stage" additionally carries "prefetch_k").
+    """
+    docs = data["docs"]
+    queries = data["queries"]
+    samples = data["samples"]
+    num_docs = len(docs)
+    # Auto-set prefetch_k to be meaningful (default: 20, or 20% of docs if >100 docs)
+    if prefetch_k is None:
+        if num_docs <= 100:
+            prefetch_k = 20  # Default: prefetch 20, rerank to top-10
+        else:
+            prefetch_k = max(20, min(100, int(num_docs * 0.2)))  # 20% for larger collections
+    # Ensure prefetch_k < num_docs for meaningful two-stage comparison
+    if prefetch_k >= num_docs:
+        logger.warning(f"⚠️  prefetch_k={prefetch_k} >= num_docs={num_docs}")
+        logger.warning("   Two-stage will fetch ALL docs (same as exhaustive)")
+        logger.warning(f"   Use --samples > {prefetch_k * 3} for meaningful comparison")
+    logger.info(f"📊 Benchmark config: {num_docs} docs, prefetch_k={prefetch_k}, top_k={top_k}")
+    logger.info(f"   (Both methods return top-{top_k} results - realistic retrieval scenario)")
+    results = {}
+    # Two-stage retrieval (NOVEL)
+    logger.info(f"\n🔬 Running Two-Stage retrieval (prefetch top-{prefetch_k}, rerank to top-{top_k})...")
+    two_stage_results, two_stage_times, skipped = _time_searches(
+        samples,
+        queries,
+        lambda query_emb: search_two_stage(query_emb, docs, prefetch_k=prefetch_k, top_k=top_k),
+        desc="Two-Stage",
+    )
+    if skipped:
+        logger.warning(f"Skipped {skipped} samples without a query embedding")
+    two_stage_metrics = compute_metrics(two_stage_results, samples)
+    # Guard the mean so an empty run reports 0.0 instead of NaN.
+    two_stage_metrics["avg_time_ms"] = np.mean(two_stage_times) * 1000 if two_stage_times else 0.0
+    two_stage_metrics["prefetch_k"] = prefetch_k
+    two_stage_metrics["top_k"] = top_k
+    results["two_stage"] = two_stage_metrics
+    # Exhaustive search (baseline)
+    if not skip_exhaustive:
+        logger.info(f"🔬 Running Exhaustive MaxSim (searches ALL {num_docs} docs, returns top-{top_k})...")
+        exhaustive_results, exhaustive_times, _ = _time_searches(
+            samples,
+            queries,
+            lambda query_emb: search_exhaustive(query_emb, docs, top_k=top_k),
+            desc="Exhaustive",
+        )
+        exhaustive_metrics = compute_metrics(exhaustive_results, samples)
+        exhaustive_metrics["avg_time_ms"] = np.mean(exhaustive_times) * 1000 if exhaustive_times else 0.0
+        exhaustive_metrics["top_k"] = top_k
+        results["exhaustive"] = exhaustive_metrics
+    return results
+def print_results(data: Dict, benchmark_results: Dict, show_precision: bool = False):
+    """
+    Print benchmark results.
+    Args:
+        data: Dict with "docs", "queries" and "model" entries.
+        benchmark_results: Method name ("two_stage" / "exhaustive") -> metrics
+            dict containing "Recall@k", "P@k", "NDCG@k", "MRR@k",
+            "avg_time_ms" and related keys.
+        show_precision: Also print the Precision@K table when True.
+    """
+    k_columns = (1, 3, 5, 7, 10)
+    # Method labels - clearer naming
+    def get_label(method):
+        if method == "two_stage":
+            return "Pooled+Rerank"  # Tile-pooled prefetch + MaxSim rerank
+        return "Full MaxSim"        # Exhaustive MaxSim on all docs
+    def print_metric_table(title, metric_name):
+        """Print one '<metric>@K' table with a row per benchmarked method."""
+        print(f"\n{title}")
+        header = " ".join(f"{'@' + str(k):>8}" for k in k_columns)
+        print(f"   {'Method':<20} {header}")
+        print(f"   {'-'*60}")
+        for method, metrics in benchmark_results.items():
+            values = [metrics.get(f"{metric_name}@{k}", 0) for k in k_columns]
+            cells = " ".join(f"{value:>8.3f}" for value in values)
+            print(f"   {get_label(method):<20} {cells}")
+    print("\n" + "=" * 80)
+    print("📊 BENCHMARK RESULTS")
+    print("=" * 80)
+    num_docs = len(data['docs'])
+    print(f"\n🤖 Model: {data['model']}")
+    print(f"📄 Documents: {num_docs}")
+    print(f"🔍 Queries: {len(data['queries'])}")
+    # Embedding stats (taken from the first document; skipped when there is none)
+    sample_doc = next(iter(data['docs'].values()), None)
+    print("\n📏 Embedding (after visual token filtering):")
+    if sample_doc is not None:
+        print(f"   Visual tokens per doc: {sample_doc['num_visual_tokens']}")
+        print(f"   Tile-pooled vectors: {sample_doc['num_tiles']}")
+    if "two_stage" in benchmark_results:
+        prefetch_k = benchmark_results["two_stage"].get("prefetch_k", "?")
+        print(f"   Two-stage prefetch_k: {prefetch_k} (of {num_docs} docs)")
+    print_metric_table("🎯 RECALL (Hit Rate) @ K:", "Recall")
+    # Precision table (optional)
+    if show_precision:
+        print_metric_table("📐 PRECISION @ K:", "P")
+    print_metric_table("📈 NDCG @ K:", "NDCG")
+    print_metric_table("🔍 MRR @ K:", "MRR")
+    # Speed comparison
+    top_k = benchmark_results.get("two_stage", benchmark_results.get("exhaustive", {})).get("top_k", 10)
+    print(f"\n⏱️  SPEED (both return top-{top_k} results):")
+    print(f"   {'Method':<20} {'Time (ms)':>12} {'Docs searched':>15}")
+    print(f"   {'-'*50}")
+    for method, metrics in benchmark_results.items():
+        if method == "two_stage":
+            searched = metrics.get("prefetch_k", "?")
+            label = f"{searched} (stage-1)"
+        else:
+            searched = num_docs
+            label = f"{searched} (all)"
+        print(f"   {get_label(method):<20} {metrics.get('avg_time_ms', 0):>12.2f} {label:>15}")
+    # Comparison summary
+    if "exhaustive" in benchmark_results and "two_stage" in benchmark_results:
+        ex = benchmark_results["exhaustive"]
+        ts = benchmark_results["two_stage"]
+        print("\n💡 POOLED+RERANK vs FULL MAXSIM:")
+        for k in [1, 5, 10]:
+            ex_recall = ex.get(f"Recall@{k}", 0)
+            ts_recall = ts.get(f"Recall@{k}", 0)
+            if ex_recall > 0:
+                retention = ts_recall / ex_recall * 100
+                print(f"   • Recall@{k} retention: {retention:.1f}% ({ts_recall:.3f} vs {ex_recall:.3f})")
+        speedup = ex["avg_time_ms"] / ts["avg_time_ms"] if ts["avg_time_ms"] > 0 else 0
+        print(f"   • Speedup: {speedup:.1f}x")
+        # Rank stats with explanation
+        if "avg_rank" in ts:
+            prefetch_k = ts.get("prefetch_k", "?")
+            top_k = ts.get("top_k", 10)
+            not_found = ts.get("not_found", 0)
+            total = len(data["queries"])
+            # Guard against a run without queries (would divide by zero).
+            miss_pct = not_found / total * 100 if total else 0.0
+            found_pct = 100 - miss_pct if total else 0.0
+            print("\n📊 POOLED+RERANK STATISTICS:")
+            print("   Stage-1 (pooled prefetch):")
+            print(f"      • Searches top-{prefetch_k} candidates using tile-pooled vectors")
+            print("   Stage-2 (MaxSim reranking):")
+            print("      • Reranks prefetch candidates with exact MaxSim")
+            print(f"      • Returns final top-{top_k} results")
+            # "not_found" is counted on the returned (reranked, truncated)
+            # list, so it is reported against the final top-k rather than
+            # being attributed to the stage-1 prefetch alone.
+            print(f"      • {total - not_found}/{total} queries ({found_pct:.1f}%) had relevant doc in final top-{top_k}")
+            print(f"      • {not_found}/{total} queries ({miss_pct:.1f}%) missed (not prefetched, or reranked below top-{top_k})")
+            if ts['avg_rank'] < float('inf'):
+                print(f"      • Avg rank of relevant doc (when found): {ts['avg_rank']:.1f}")
+                print(f"      • Median rank: {ts['median_rank']:.1f}")
+            print(f"\n   💡 The {miss_pct:.1f}% miss rate is measured on the final top-{top_k} results,")
+            print("      so it combines stage-1 prefetch misses with stage-2 rerank misses.")
+    print("\n" + "=" * 80)
+    print("✅ Benchmark complete!")
+def main():
+    """
+    Command-line entry point: parse options, load samples, embed them,
+    run the retrieval benchmark and print the report.
+    Exits with status 1 when no samples could be loaded.
+    """
+    parser = argparse.ArgumentParser(
+        description="Quick benchmark for visual-rag-toolkit",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # Declared as data and registered in order, so --help lists them as written.
+    cli_options = (
+        ("--samples", {
+            "type": int, "default": 100,
+            "help": "Number of samples (default: 100)",
+        }),
+        ("--model", {
+            "type": str, "default": "vidore/colSmol-500M",
+            "help": "Model: vidore/colSmol-500M (default), vidore/colpali-v1.3",
+        }),
+        ("--prefetch-k", {
+            "type": int, "default": None,
+            "help": "Stage 1 candidates for two-stage (default: 20 for <=100 docs, auto for larger)",
+        }),
+        ("--skip-exhaustive", {
+            "action": "store_true",
+            "help": "Skip exhaustive baseline (faster)",
+        }),
+        ("--show-precision", {
+            "action": "store_true",
+            "help": "Show Precision@K metrics (hidden by default)",
+        }),
+        ("--top-k", {
+            "type": int, "default": 10,
+            "help": "Number of results to return (default: 10, realistic retrieval scenario)",
+        }),
+    )
+    for flag, options in cli_options:
+        parser.add_argument(flag, **options)
+    args = parser.parse_args()
+    banner = "=" * 70
+    print("\n" + banner)
+    print("🧪 VISUAL RAG TOOLKIT - RETRIEVAL BENCHMARK")
+    print(banner)
+    # Load samples
+    samples = load_vidore_sample(args.samples)
+    if not samples:
+        logger.error("No samples loaded!")
+        sys.exit(1)
+    # Embed, benchmark, report
+    data = embed_all(samples, args.model)
+    benchmark_results = run_benchmark(
+        data,
+        skip_exhaustive=args.skip_exhaustive,
+        prefetch_k=args.prefetch_k,
+        top_k=args.top_k,
+    )
+    print_results(data, benchmark_results, show_precision=args.show_precision)
+if __name__ == "__main__":
+    main()

visual_rag/__init__.py CHANGED Viewed

@@ -14,16 +14,16 @@ Components:
 Quick Start:
 ------------
 >>> from visual_rag import VisualEmbedder, PDFProcessor, TwoStageRetriever
->>>
 >>> # Process PDFs
 >>> processor = PDFProcessor(dpi=140)
 >>> images, texts = processor.process_pdf("report.pdf")
->>>
 >>> # Generate embeddings
 >>> embedder = VisualEmbedder()
 >>> embeddings = embedder.embed_images(images)
 >>> query_emb = embedder.embed_query("What is the budget?")
->>>
 >>> # Search with two-stage retrieval
 >>> retriever = TwoStageRetriever(qdrant_client, "my_collection")
 >>> results = retriever.search(query_emb, top_k=10)
@@ -31,7 +31,7 @@ Quick Start:
 Each component works independently - use only what you need.
 """
-__version__ = "0.1.0"
 # Import main classes at package level for convenience
 # These are optional - if dependencies aren't installed, we catch the error
@@ -71,13 +71,17 @@ try:
 except ImportError:
     QdrantAdmin = None
 # Config utilities (always available)
-from visual_rag.config import load_config, get, get_section
 __all__ = [
     # Version
     "__version__",
     # Main classes
     "VisualEmbedder",
     "PDFProcessor",
@@ -86,7 +90,7 @@ __all__ = [
     "TwoStageRetriever",
     "MultiVectorRetriever",
     "QdrantAdmin",
     # Config utilities
     "load_config",
     "get",

 Quick Start:
 ------------
 >>> from visual_rag import VisualEmbedder, PDFProcessor, TwoStageRetriever
+>>>
 >>> # Process PDFs
 >>> processor = PDFProcessor(dpi=140)
 >>> images, texts = processor.process_pdf("report.pdf")
+>>>
 >>> # Generate embeddings
 >>> embedder = VisualEmbedder()
 >>> embeddings = embedder.embed_images(images)
 >>> query_emb = embedder.embed_query("What is the budget?")
+>>>
 >>> # Search with two-stage retrieval
 >>> retriever = TwoStageRetriever(qdrant_client, "my_collection")
 >>> results = retriever.search(query_emb, top_k=10)
 Each component works independently - use only what you need.
 """
+__version__ = "0.1.3"
 # Import main classes at package level for convenience
 # These are optional - if dependencies aren't installed, we catch the error
 except ImportError:
     QdrantAdmin = None
+try:
+    from visual_rag.demo_runner import demo
+except ImportError:
+    demo = None
 # Config utilities (always available)
+from visual_rag.config import get, get_section, load_config
 __all__ = [
     # Version
     "__version__",
     # Main classes
     "VisualEmbedder",
     "PDFProcessor",
     "TwoStageRetriever",
     "MultiVectorRetriever",
     "QdrantAdmin",
+    "demo",
     # Config utilities
     "load_config",
     "get",

visual_rag/cli/__init__.py CHANGED Viewed

	@@ -1,3 +1 @@
1	"""CLI entry point for visual-rag-toolkit."""
2	-
3	-


1	"""CLI entry point for visual-rag-toolkit."""

visual_rag/cli/main.py CHANGED Viewed

@@ -10,20 +10,19 @@ Provides command-line interface for:
 Usage:
     # Process PDFs (like process_pdfs_saliency_v2.py)
     visual-rag process --reports-dir ./pdfs --metadata-file metadata.json
     # Search
     visual-rag search --query "budget allocation" --collection my_docs
     # Show collection info
     visual-rag info --collection my_docs
 """
-import os
-import sys
 import argparse
 import logging
 from pathlib import Path
-from typing import Optional
 from urllib.parse import urlparse
 from dotenv import load_dotenv
@@ -44,38 +43,38 @@ def setup_logging(debug: bool = False):
 def cmd_process(args):
     """
     Process PDFs: convert → embed → upload to Cloudinary → index in Qdrant.
     Equivalent to process_pdfs_saliency_v2.py
     """
-    from visual_rag import VisualEmbedder, QdrantIndexer, CloudinaryUploader, load_config
     from visual_rag.indexing.pipeline import ProcessingPipeline
     # Load environment
     load_dotenv()
     # Load config
     config = {}
     if args.config and Path(args.config).exists():
         config = load_config(args.config)
     # Get PDFs
     reports_dir = Path(args.reports_dir)
     if not reports_dir.exists():
         logger.error(f"❌ Reports directory not found: {reports_dir}")
         sys.exit(1)
     pdf_paths = sorted(reports_dir.glob("*.pdf")) + sorted(reports_dir.glob("*.PDF"))
     if not pdf_paths:
         logger.error(f"❌ No PDF files found in: {reports_dir}")
         sys.exit(1)
     logger.info(f"📁 Found {len(pdf_paths)} PDF files")
     # Load metadata mapping
     metadata_mapping = {}
     if args.metadata_file:
         metadata_mapping = ProcessingPipeline.load_metadata_mapping(Path(args.metadata_file))
     # Dry run - just show summary
     if args.dry_run:
         logger.info("🏃 DRY RUN MODE")
@@ -83,21 +82,24 @@ def cmd_process(args):
         logger.info(f"   Metadata entries: {len(metadata_mapping)}")
         logger.info(f"   Collection: {args.collection}")
         logger.info(f"   Cloudinary: {'ENABLED' if not args.no_cloudinary else 'DISABLED'}")
         for pdf in pdf_paths[:10]:
             has_meta = "✓" if pdf.stem.lower() in metadata_mapping else "✗"
             logger.info(f"   {has_meta} {pdf.name}")
         if len(pdf_paths) > 10:
             logger.info(f"   ... and {len(pdf_paths) - 10} more")
         return
     # Get settings
     model_name = args.model or config.get("model", {}).get("name", "vidore/colSmol-500M")
-    collection_name = args.collection or config.get("qdrant", {}).get("collection_name", "visual_documents")
     torch_dtype = None
     if args.torch_dtype != "auto":
         import torch
         torch_dtype = {
             "float32": torch.float32,
             "float16": torch.float16,
@@ -111,20 +113,22 @@ def cmd_process(args):
         torch_dtype=torch_dtype,
         processor_speed=str(getattr(args, "processor_speed", "fast")),
     )
     # Initialize Qdrant indexer
-    qdrant_url = os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL environment variable not set")
         sys.exit(1)
     logger.info(f"🔌 Connecting to Qdrant: {qdrant_url}")
     indexer = QdrantIndexer(
         url=qdrant_url,
@@ -133,7 +137,7 @@ def cmd_process(args):
         prefer_grpc=args.prefer_grpc,
         vector_datatype=args.qdrant_vector_dtype,
     )
     # Create collection if needed
     indexer.create_collection(force_recreate=args.force_recreate)
     inferred_fields = []
@@ -166,7 +170,7 @@ def cmd_process(args):
             inferred_fields.append({"field": k, "type": inferred_type})
     indexer.create_payload_indexes(fields=inferred_fields)
     # Initialize Cloudinary uploader (optional)
     cloudinary_uploader = None
     if not args.no_cloudinary:
@@ -176,7 +180,7 @@ def cmd_process(args):
         except ValueError as e:
             logger.warning(f"⚠️ Cloudinary not configured: {e}")
             logger.warning("   Continuing without Cloudinary uploads")
     # Create pipeline
     pipeline = ProcessingPipeline(
         embedder=embedder,
@@ -186,42 +190,44 @@ def cmd_process(args):
         config=config,
         embedding_strategy=args.strategy,
         crop_empty=bool(getattr(args, "crop_empty", False)),
-        crop_empty_percentage_to_remove=float(getattr(args, "crop_empty_percentage_to_remove", 0.9)),
         crop_empty_remove_page_number=bool(getattr(args, "crop_empty_remove_page_number", False)),
     )
     # Process PDFs
     total_uploaded = 0
     total_skipped = 0
     total_failed = 0
     skip_existing = not args.no_skip_existing
     for pdf_idx, pdf_path in enumerate(pdf_paths, 1):
         logger.info(f"\n{'='*60}")
         logger.info(f"📄 [{pdf_idx}/{len(pdf_paths)}] {pdf_path.name}")
         logger.info(f"{'='*60}")
         result = pipeline.process_pdf(
             pdf_path,
             skip_existing=skip_existing,
             upload_to_cloudinary=(not args.no_cloudinary),
             upload_to_qdrant=True,
         )
         total_uploaded += result["uploaded"]
         total_skipped += result["skipped"]
         total_failed += result["failed"]
     # Summary
     logger.info(f"\n{'='*60}")
-    logger.info(f"📊 SUMMARY")
     logger.info(f"{'='*60}")
     logger.info(f"   Total PDFs: {len(pdf_paths)}")
     logger.info(f"   Uploaded: {total_uploaded}")
     logger.info(f"   Skipped: {total_skipped}")
     logger.info(f"   Failed: {total_failed}")
     info = indexer.get_collection_info()
     if info:
         logger.info(f"   Collection points: {info.get('points_count', 'N/A')}")
@@ -229,29 +235,34 @@ def cmd_process(args):
 def cmd_search(args):
     """Search documents."""
-    from visual_rag import VisualEmbedder
-    from visual_rag.retrieval import TwoStageRetriever, SingleStageRetriever
     from qdrant_client import QdrantClient
     load_dotenv()
-    qdrant_url = os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL not set")
         sys.exit(1)
     # Initialize
     logger.info(f"🤖 Loading model: {args.model}")
-    embedder = VisualEmbedder(model_name=args.model, processor_speed=str(getattr(args, "processor_speed", "fast")))
-    logger.info(f"🔌 Connecting to Qdrant")
     grpc_port = 6334 if args.prefer_grpc and urlparse(qdrant_url).port == 6333 else None
     client = QdrantClient(
         url=qdrant_url,
@@ -262,11 +273,11 @@ def cmd_search(args):
     )
     two_stage = TwoStageRetriever(client, args.collection)
     single_stage = SingleStageRetriever(client, args.collection)
     # Embed query
     logger.info(f"🔍 Query: {args.query}")
     query_embedding = embedder.embed_query(args.query)
     # Build filter
     filter_obj = None
     if args.year or args.source or args.district:
@@ -275,7 +286,7 @@ def cmd_search(args):
             source=args.source,
             district=args.district,
         )
     # Search
     query_np = query_embedding.detach().cpu().numpy()
     if args.strategy == "single_full":
@@ -307,21 +318,21 @@ def cmd_search(args):
             filter_obj=filter_obj,
             stage1_mode=args.stage1_mode,
         )
     # Display results
     logger.info(f"\n📊 Results ({len(results)}):")
     for i, result in enumerate(results, 1):
         payload = result.get("payload", {})
         score = result.get("score_final", result.get("score_stage1", 0))
         filename = payload.get("filename", "N/A")
         page_num = payload.get("page_number", "N/A")
         year = payload.get("year", "N/A")
         source = payload.get("source", "N/A")
         logger.info(f"  {i}. {filename} p.{page_num}")
         logger.info(f"     Score: {score:.4f} | Year: {year} | Source: {source}")
         # Text snippet
         text = payload.get("text", "")
         if text and args.show_text:
@@ -332,21 +343,23 @@ def cmd_search(args):
 def cmd_info(args):
     """Show collection info."""
     from qdrant_client import QdrantClient
     load_dotenv()
-    qdrant_url = os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL not set")
         sys.exit(1)
     grpc_port = 6334 if args.prefer_grpc and urlparse(qdrant_url).port == 6333 else None
     client = QdrantClient(
         url=qdrant_url,
@@ -355,29 +368,29 @@ def cmd_info(args):
         grpc_port=grpc_port,
         check_compatibility=False,
     )
     try:
         info = client.get_collection(args.collection)
         status = info.status
         if hasattr(status, "value"):
             status = status.value
         indexed_count = getattr(info, "indexed_vectors_count", 0) or 0
         if isinstance(indexed_count, dict):
             indexed_count = sum(indexed_count.values())
         logger.info(f"📊 Collection: {args.collection}")
         logger.info(f"   Status: {status}")
         logger.info(f"   Points: {info.points_count}")
         logger.info(f"   Indexed vectors: {indexed_count}")
         # Show vector config
         if hasattr(info, "config") and hasattr(info.config, "params"):
             vectors = getattr(info.config.params, "vectors", {})
             if vectors:
                 logger.info(f"   Vectors: {list(vectors.keys())}")
     except Exception as e:
         logger.error(f"❌ Could not get collection info: {e}")
         sys.exit(1)
@@ -393,24 +406,24 @@ def main():
 Examples:
   # Process PDFs (like process_pdfs_saliency_v2.py)
   visual-rag process --reports-dir ./pdfs --metadata-file metadata.json
   # Process without Cloudinary
   visual-rag process --reports-dir ./pdfs --no-cloudinary
   # Search
   visual-rag search --query "budget allocation" --collection my_docs
   # Search with filters
   visual-rag search --query "budget" --year 2023 --source "Local Government"
   # Show collection info
   visual-rag info --collection my_docs
         """,
     )
     parser.add_argument("--debug", action="store_true", help="Enable debug logging")
     subparsers = parser.add_subparsers(dest="command", help="Command")
     # =========================================================================
     # PROCESS command
     # =========================================================================
@@ -420,32 +433,26 @@ Examples:
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     process_parser.add_argument(
-        "--reports-dir", type=str, required=True,
-        help="Directory containing PDF files"
-    )
-    process_parser.add_argument(
-        "--metadata-file", type=str,
-        help="JSON file with filename → metadata mapping (like filename_metadata.json)"
-    )
-    process_parser.add_argument(
-        "--collection", type=str, default="visual_documents",
-        help="Qdrant collection name"
     )
     process_parser.add_argument(
-        "--model", type=str, default="vidore/colSmol-500M",
-        help="Model name (vidore/colSmol-500M, vidore/colpali-v1.3, etc.)"
     )
     process_parser.add_argument(
-        "--batch-size", type=int, default=8,
-        help="Embedding batch size"
     )
     process_parser.add_argument(
-        "--config", type=str,
-        help="Path to config.yaml file"
     )
     process_parser.add_argument(
-        "--no-cloudinary", action="store_true",
-        help="Skip Cloudinary uploads"
     )
     process_parser.add_argument(
         "--crop-empty",
@@ -464,22 +471,23 @@ Examples:
         help="If set, attempts to crop away the bottom region that contains sparse page numbers (default: off).",
     )
     process_parser.add_argument(
-        "--no-skip-existing", action="store_true",
-        help="Process all pages even if they exist in Qdrant"
     )
     process_parser.add_argument(
-        "--force-recreate", action="store_true",
-        help="Delete and recreate collection"
     )
     process_parser.add_argument(
-        "--dry-run", action="store_true",
-        help="Show what would be processed without doing it"
     )
     process_parser.add_argument(
-        "--strategy", type=str, default="pooling",
         choices=["pooling", "standard", "all"],
         help="Embedding strategy: 'pooling' (NOVEL), 'standard' (BASELINE), "
-             "'all' (embed once, store BOTH for comparison)"
     )
     process_parser.add_argument(
         "--torch-dtype",
@@ -517,7 +525,7 @@ Examples:
         help="Disable gRPC for Qdrant client.",
     )
     process_parser.set_defaults(func=cmd_process)
     # =========================================================================
     # SEARCH command
     # =========================================================================
@@ -525,17 +533,12 @@ Examples:
         "search",
         help="Search documents",
     )
     search_parser.add_argument(
-        "--query", type=str, required=True,
-        help="Search query"
     )
     search_parser.add_argument(
-        "--collection", type=str, default="visual_documents",
-        help="Qdrant collection name"
-    )
-    search_parser.add_argument(
-        "--model", type=str, default="vidore/colSmol-500M",
-        help="Model name"
     )
     search_parser.add_argument(
         "--processor-speed",
@@ -544,39 +547,29 @@ Examples:
         choices=["fast", "slow", "auto"],
         help="Processor implementation: fast (default, with fallback to slow), slow, or auto.",
     )
     search_parser.add_argument(
-        "--top-k", type=int, default=10,
-        help="Number of results"
-    )
-    search_parser.add_argument(
-        "--strategy", type=str, default="single_full",
         choices=["single_full", "single_tiles", "single_global", "two_stage"],
-        help="Search strategy"
     )
     search_parser.add_argument(
-        "--prefetch-k", type=int, default=200,
-        help="Prefetch candidates for two-stage retrieval"
     )
     search_parser.add_argument(
-        "--stage1-mode", type=str, default="pooled_query_vs_tiles",
         choices=["pooled_query_vs_tiles", "tokens_vs_tiles", "pooled_query_vs_global"],
-        help="Stage 1 mode for two-stage retrieval"
     )
     search_parser.add_argument(
-        "--year", type=int,
-        help="Filter by year"
-    )
-    search_parser.add_argument(
-        "--source", type=str,
-        help="Filter by source"
-    )
-    search_parser.add_argument(
-        "--district", type=str,
-        help="Filter by district"
-    )
-    search_parser.add_argument(
-        "--show-text", action="store_true",
-        help="Show text snippets in results"
     )
     search_grpc_group = search_parser.add_mutually_exclusive_group()
     search_grpc_group.add_argument(
@@ -593,7 +586,7 @@ Examples:
         help="Disable gRPC for Qdrant client.",
     )
     search_parser.set_defaults(func=cmd_search)
     # =========================================================================
     # INFO command
     # =========================================================================
@@ -602,8 +595,7 @@ Examples:
         help="Show collection info",
     )
     info_parser.add_argument(
-        "--collection", type=str, default="visual_documents",
-        help="Qdrant collection name"
     )
     info_grpc_group = info_parser.add_mutually_exclusive_group()
     info_grpc_group.add_argument(
@@ -620,16 +612,16 @@ Examples:
         help="Disable gRPC for Qdrant client.",
     )
     info_parser.set_defaults(func=cmd_info)
     # Parse and execute
     args = parser.parse_args()
     setup_logging(args.debug)
     if not args.command:
         parser.print_help()
         sys.exit(0)
     args.func(args)

 Usage:
     # Process PDFs (like process_pdfs_saliency_v2.py)
     visual-rag process --reports-dir ./pdfs --metadata-file metadata.json
     # Search
     visual-rag search --query "budget allocation" --collection my_docs
     # Show collection info
     visual-rag info --collection my_docs
 """
 import argparse
 import logging
+import os
+import sys
 from pathlib import Path
 from urllib.parse import urlparse
 from dotenv import load_dotenv
 def cmd_process(args):
     """
     Process PDFs: convert → embed → upload to Cloudinary → index in Qdrant.
     Equivalent to process_pdfs_saliency_v2.py
     """
+    from visual_rag import CloudinaryUploader, QdrantIndexer, VisualEmbedder, load_config
     from visual_rag.indexing.pipeline import ProcessingPipeline
     # Load environment
     load_dotenv()
     # Load config
     config = {}
     if args.config and Path(args.config).exists():
         config = load_config(args.config)
     # Get PDFs
     reports_dir = Path(args.reports_dir)
     if not reports_dir.exists():
         logger.error(f"❌ Reports directory not found: {reports_dir}")
         sys.exit(1)
     pdf_paths = sorted(reports_dir.glob("*.pdf")) + sorted(reports_dir.glob("*.PDF"))
     if not pdf_paths:
         logger.error(f"❌ No PDF files found in: {reports_dir}")
         sys.exit(1)
     logger.info(f"📁 Found {len(pdf_paths)} PDF files")
     # Load metadata mapping
     metadata_mapping = {}
     if args.metadata_file:
         metadata_mapping = ProcessingPipeline.load_metadata_mapping(Path(args.metadata_file))
     # Dry run - just show summary
     if args.dry_run:
         logger.info("🏃 DRY RUN MODE")
         logger.info(f"   Metadata entries: {len(metadata_mapping)}")
         logger.info(f"   Collection: {args.collection}")
         logger.info(f"   Cloudinary: {'ENABLED' if not args.no_cloudinary else 'DISABLED'}")
         for pdf in pdf_paths[:10]:
             has_meta = "✓" if pdf.stem.lower() in metadata_mapping else "✗"
             logger.info(f"   {has_meta} {pdf.name}")
         if len(pdf_paths) > 10:
             logger.info(f"   ... and {len(pdf_paths) - 10} more")
         return
     # Get settings
     model_name = args.model or config.get("model", {}).get("name", "vidore/colSmol-500M")
+    collection_name = args.collection or config.get("qdrant", {}).get(
+        "collection_name", "visual_documents"
+    )
     torch_dtype = None
     if args.torch_dtype != "auto":
         import torch
         torch_dtype = {
             "float32": torch.float32,
             "float16": torch.float16,
         torch_dtype=torch_dtype,
         processor_speed=str(getattr(args, "processor_speed", "fast")),
     )
     # Initialize Qdrant indexer
+    qdrant_url = (
+        os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
+    )
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL environment variable not set")
         sys.exit(1)
     logger.info(f"🔌 Connecting to Qdrant: {qdrant_url}")
     indexer = QdrantIndexer(
         url=qdrant_url,
         prefer_grpc=args.prefer_grpc,
         vector_datatype=args.qdrant_vector_dtype,
     )
     # Create collection if needed
     indexer.create_collection(force_recreate=args.force_recreate)
     inferred_fields = []
             inferred_fields.append({"field": k, "type": inferred_type})
     indexer.create_payload_indexes(fields=inferred_fields)
     # Initialize Cloudinary uploader (optional)
     cloudinary_uploader = None
     if not args.no_cloudinary:
         except ValueError as e:
             logger.warning(f"⚠️ Cloudinary not configured: {e}")
             logger.warning("   Continuing without Cloudinary uploads")
     # Create pipeline
     pipeline = ProcessingPipeline(
         embedder=embedder,
         config=config,
         embedding_strategy=args.strategy,
         crop_empty=bool(getattr(args, "crop_empty", False)),
+        crop_empty_percentage_to_remove=float(
+            getattr(args, "crop_empty_percentage_to_remove", 0.9)
+        ),
         crop_empty_remove_page_number=bool(getattr(args, "crop_empty_remove_page_number", False)),
     )
     # Process PDFs
     total_uploaded = 0
     total_skipped = 0
     total_failed = 0
     skip_existing = not args.no_skip_existing
     for pdf_idx, pdf_path in enumerate(pdf_paths, 1):
         logger.info(f"\n{'='*60}")
         logger.info(f"📄 [{pdf_idx}/{len(pdf_paths)}] {pdf_path.name}")
         logger.info(f"{'='*60}")
         result = pipeline.process_pdf(
             pdf_path,
             skip_existing=skip_existing,
             upload_to_cloudinary=(not args.no_cloudinary),
             upload_to_qdrant=True,
         )
         total_uploaded += result["uploaded"]
         total_skipped += result["skipped"]
         total_failed += result["failed"]
     # Summary
     logger.info(f"\n{'='*60}")
+    logger.info("📊 SUMMARY")
     logger.info(f"{'='*60}")
     logger.info(f"   Total PDFs: {len(pdf_paths)}")
     logger.info(f"   Uploaded: {total_uploaded}")
     logger.info(f"   Skipped: {total_skipped}")
     logger.info(f"   Failed: {total_failed}")
     info = indexer.get_collection_info()
     if info:
         logger.info(f"   Collection points: {info.get('points_count', 'N/A')}")
 def cmd_search(args):
     """Search documents."""
     from qdrant_client import QdrantClient
+    from visual_rag import VisualEmbedder
+    from visual_rag.retrieval import SingleStageRetriever, TwoStageRetriever
     load_dotenv()
+    qdrant_url = (
+        os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
+    )
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL not set")
         sys.exit(1)
     # Initialize
     logger.info(f"🤖 Loading model: {args.model}")
+    embedder = VisualEmbedder(
+        model_name=args.model, processor_speed=str(getattr(args, "processor_speed", "fast"))
+    )
+    logger.info("🔌 Connecting to Qdrant")
     grpc_port = 6334 if args.prefer_grpc and urlparse(qdrant_url).port == 6333 else None
     client = QdrantClient(
         url=qdrant_url,
     )
     two_stage = TwoStageRetriever(client, args.collection)
     single_stage = SingleStageRetriever(client, args.collection)
     # Embed query
     logger.info(f"🔍 Query: {args.query}")
     query_embedding = embedder.embed_query(args.query)
     # Build filter
     filter_obj = None
     if args.year or args.source or args.district:
             source=args.source,
             district=args.district,
         )
     # Search
     query_np = query_embedding.detach().cpu().numpy()
     if args.strategy == "single_full":
             filter_obj=filter_obj,
             stage1_mode=args.stage1_mode,
         )
     # Display results
     logger.info(f"\n📊 Results ({len(results)}):")
     for i, result in enumerate(results, 1):
         payload = result.get("payload", {})
         score = result.get("score_final", result.get("score_stage1", 0))
         filename = payload.get("filename", "N/A")
         page_num = payload.get("page_number", "N/A")
         year = payload.get("year", "N/A")
         source = payload.get("source", "N/A")
         logger.info(f"  {i}. {filename} p.{page_num}")
         logger.info(f"     Score: {score:.4f} | Year: {year} | Source: {source}")
         # Text snippet
         text = payload.get("text", "")
         if text and args.show_text:
 def cmd_info(args):
     """Show collection info."""
     from qdrant_client import QdrantClient
     load_dotenv()
+    qdrant_url = (
+        os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
+    )
     qdrant_api_key = (
         os.getenv("SIGIR_QDRANT_KEY")
         or os.getenv("SIGIR_QDRANT_API_KEY")
         or os.getenv("DEST_QDRANT_API_KEY")
         or os.getenv("QDRANT_API_KEY")
     )
     if not qdrant_url:
         logger.error("❌ QDRANT_URL not set")
         sys.exit(1)
     grpc_port = 6334 if args.prefer_grpc and urlparse(qdrant_url).port == 6333 else None
     client = QdrantClient(
         url=qdrant_url,
         grpc_port=grpc_port,
         check_compatibility=False,
     )
     try:
         info = client.get_collection(args.collection)
         status = info.status
         if hasattr(status, "value"):
             status = status.value
         indexed_count = getattr(info, "indexed_vectors_count", 0) or 0
         if isinstance(indexed_count, dict):
             indexed_count = sum(indexed_count.values())
         logger.info(f"📊 Collection: {args.collection}")
         logger.info(f"   Status: {status}")
         logger.info(f"   Points: {info.points_count}")
         logger.info(f"   Indexed vectors: {indexed_count}")
         # Show vector config
         if hasattr(info, "config") and hasattr(info.config, "params"):
             vectors = getattr(info.config.params, "vectors", {})
             if vectors:
                 logger.info(f"   Vectors: {list(vectors.keys())}")
     except Exception as e:
         logger.error(f"❌ Could not get collection info: {e}")
         sys.exit(1)
 Examples:
   # Process PDFs (like process_pdfs_saliency_v2.py)
   visual-rag process --reports-dir ./pdfs --metadata-file metadata.json
   # Process without Cloudinary
   visual-rag process --reports-dir ./pdfs --no-cloudinary
   # Search
   visual-rag search --query "budget allocation" --collection my_docs
   # Search with filters
   visual-rag search --query "budget" --year 2023 --source "Local Government"
   # Show collection info
   visual-rag info --collection my_docs
         """,
     )
     parser.add_argument("--debug", action="store_true", help="Enable debug logging")
     subparsers = parser.add_subparsers(dest="command", help="Command")
     # =========================================================================
     # PROCESS command
     # =========================================================================
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     process_parser.add_argument(
+        "--reports-dir", type=str, required=True, help="Directory containing PDF files"
     )
     process_parser.add_argument(
+        "--metadata-file",
+        type=str,
+        help="JSON file with filename → metadata mapping (like filename_metadata.json)",
     )
     process_parser.add_argument(
+        "--collection", type=str, default="visual_documents", help="Qdrant collection name"
     )
     process_parser.add_argument(
+        "--model",
+        type=str,
+        default="vidore/colSmol-500M",
+        help="Model name (vidore/colSmol-500M, vidore/colpali-v1.3, etc.)",
     )
+    process_parser.add_argument("--batch-size", type=int, default=8, help="Embedding batch size")
+    process_parser.add_argument("--config", type=str, help="Path to config.yaml file")
     process_parser.add_argument(
+        "--no-cloudinary", action="store_true", help="Skip Cloudinary uploads"
     )
     process_parser.add_argument(
         "--crop-empty",
         help="If set, attempts to crop away the bottom region that contains sparse page numbers (default: off).",
     )
     process_parser.add_argument(
+        "--no-skip-existing",
+        action="store_true",
+        help="Process all pages even if they exist in Qdrant",
     )
     process_parser.add_argument(
+        "--force-recreate", action="store_true", help="Delete and recreate collection"
     )
     process_parser.add_argument(
+        "--dry-run", action="store_true", help="Show what would be processed without doing it"
     )
     process_parser.add_argument(
+        "--strategy",
+        type=str,
+        default="pooling",
         choices=["pooling", "standard", "all"],
         help="Embedding strategy: 'pooling' (NOVEL), 'standard' (BASELINE), "
+        "'all' (embed once, store BOTH for comparison)",
     )
     process_parser.add_argument(
         "--torch-dtype",
         help="Disable gRPC for Qdrant client.",
     )
     process_parser.set_defaults(func=cmd_process)
     # =========================================================================
     # SEARCH command
     # =========================================================================
         "search",
         help="Search documents",
     )
+    search_parser.add_argument("--query", type=str, required=True, help="Search query")
     search_parser.add_argument(
+        "--collection", type=str, default="visual_documents", help="Qdrant collection name"
     )
     search_parser.add_argument(
+        "--model", type=str, default="vidore/colSmol-500M", help="Model name"
     )
     search_parser.add_argument(
         "--processor-speed",
         choices=["fast", "slow", "auto"],
         help="Processor implementation: fast (default, with fallback to slow), slow, or auto.",
     )
+    search_parser.add_argument("--top-k", type=int, default=10, help="Number of results")
     search_parser.add_argument(
+        "--strategy",
+        type=str,
+        default="single_full",
         choices=["single_full", "single_tiles", "single_global", "two_stage"],
+        help="Search strategy",
     )
     search_parser.add_argument(
+        "--prefetch-k", type=int, default=200, help="Prefetch candidates for two-stage retrieval"
     )
     search_parser.add_argument(
+        "--stage1-mode",
+        type=str,
+        default="pooled_query_vs_tiles",
         choices=["pooled_query_vs_tiles", "tokens_vs_tiles", "pooled_query_vs_global"],
+        help="Stage 1 mode for two-stage retrieval",
     )
+    search_parser.add_argument("--year", type=int, help="Filter by year")
+    search_parser.add_argument("--source", type=str, help="Filter by source")
+    search_parser.add_argument("--district", type=str, help="Filter by district")
     search_parser.add_argument(
+        "--show-text", action="store_true", help="Show text snippets in results"
     )
     search_grpc_group = search_parser.add_mutually_exclusive_group()
     search_grpc_group.add_argument(
         help="Disable gRPC for Qdrant client.",
     )
     search_parser.set_defaults(func=cmd_search)
     # =========================================================================
     # INFO command
     # =========================================================================
         help="Show collection info",
     )
     info_parser.add_argument(
+        "--collection", type=str, default="visual_documents", help="Qdrant collection name"
     )
     info_grpc_group = info_parser.add_mutually_exclusive_group()
     info_grpc_group.add_argument(
         help="Disable gRPC for Qdrant client.",
     )
     info_parser.set_defaults(func=cmd_info)
     # Parse and execute
     args = parser.parse_args()
     setup_logging(args.debug)
     if not args.command:
         parser.print_help()
         sys.exit(0)
     args.func(args)

visual_rag/config.py CHANGED Viewed

@@ -7,57 +7,56 @@ Provides:
 - Convenience getters for common settings
 """
-import os
 import logging
 from pathlib import Path
-from typing import Any, Optional, Dict
 logger = logging.getLogger(__name__)
-# Global config cache
-_config_cache: Optional[Dict[str, Any]] = None
 def _env_qdrant_url() -> Optional[str]:
-    return os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
 def _env_qdrant_api_key() -> Optional[str]:
-    return (
-        os.getenv("SIGIR_QDRANT_KEY")
-        or os.getenv("SIGIR_QDRANT_API_KEY")
-        or os.getenv("DEST_QDRANT_API_KEY")
-        or os.getenv("QDRANT_API_KEY")
-    )
 def load_config(
     config_path: Optional[str] = None,
     force_reload: bool = False,
 ) -> Dict[str, Any]:
     """
     Load configuration from YAML file.
     Uses caching to avoid repeated file I/O.
     Environment variables can override config values.
     Args:
         config_path: Path to config file (auto-detected if None)
         force_reload: Bypass cache and reload from file
     Returns:
         Configuration dictionary
     """
-    global _config_cache
-    # Return cached config if available
-    if _config_cache is not None and not force_reload:
-        return _config_cache
     # Find config file
     if config_path is None:
         config_path = os.getenv("VISUALRAG_CONFIG")
         if config_path is None:
             # Check common locations
             search_paths = [
@@ -65,65 +64,75 @@ def load_config(
                 Path.cwd() / "visual_rag.yaml",
                 Path.home() / ".visual_rag" / "config.yaml",
             ]
             for path in search_paths:
                 if path.exists():
                     config_path = str(path)
                     break
     # Load YAML if file exists
     config = {}
     if config_path and Path(config_path).exists():
         try:
             import yaml
             with open(config_path, "r") as f:
                 config = yaml.safe_load(f) or {}
             logger.info(f"Loaded config from: {config_path}")
         except ImportError:
             logger.warning("PyYAML not installed, using environment variables only")
         except Exception as e:
             logger.warning(f"Could not load config file: {e}")
-    # Apply environment variable overrides
-    config = _apply_env_overrides(config)
-    _config_cache = config
-    return config
 def _apply_env_overrides(config: Dict[str, Any]) -> Dict[str, Any]:
     """Apply environment variable overrides."""
     env_mappings = {
         # Qdrant
         "QDRANT_URL": ["qdrant", "url"],
         "QDRANT_API_KEY": ["qdrant", "api_key"],
         "QDRANT_COLLECTION": ["qdrant", "collection"],
         # Model
         "VISUALRAG_MODEL": ["model", "name"],
         "COLPALI_MODEL_NAME": ["model", "name"],  # Alias
         "EMBEDDING_BATCH_SIZE": ["model", "batch_size"],
         # Cloudinary
         "CLOUDINARY_CLOUD_NAME": ["cloudinary", "cloud_name"],
         "CLOUDINARY_API_KEY": ["cloudinary", "api_key"],
         "CLOUDINARY_API_SECRET": ["cloudinary", "api_secret"],
         # Processing
         "PDF_DPI": ["processing", "dpi"],
         "JPEG_QUALITY": ["processing", "jpeg_quality"],
         # Search
         "SEARCH_STRATEGY": ["search", "strategy"],
         "PREFETCH_K": ["search", "prefetch_k"],
         # Special token handling
         "VISUALRAG_INCLUDE_SPECIAL_TOKENS": ["embedding", "include_special_tokens"],
     }
     for env_var, path in env_mappings.items():
         value = os.getenv(env_var)
         if value is not None:
@@ -133,50 +142,51 @@ def _apply_env_overrides(config: Dict[str, Any]) -> Dict[str, Any]:
                 if key not in current:
                     current[key] = {}
                 current = current[key]
             # Convert value to appropriate type
             final_key = path[-1]
             if final_key in current:
                 existing_type = type(current[final_key])
-                if existing_type == bool:
                     value = value.lower() in ("true", "1", "yes", "on")
-                elif existing_type == int:
                     value = int(value)
-                elif existing_type == float:
                     value = float(value)
             current[final_key] = value
             logger.debug(f"Config override: {'.'.join(path)} = {value}")
     return config
 def get(key: str, default: Any = None) -> Any:
     """
     Get a configuration value by dot-notation path.
     Examples:
         >>> get("qdrant.url")
         >>> get("model.name", "vidore/colSmol-500M")
         >>> get("search.strategy", "multi_vector")
     """
-    config = load_config()
     keys = key.split(".")
     current = config
     for k in keys:
         if isinstance(current, dict) and k in current:
             current = current[k]
         else:
             return default
     return current
-def get_section(section: str) -> Dict[str, Any]:
     """Get an entire configuration section."""
-    config = load_config()
     return config.get(section, {})
@@ -215,5 +225,3 @@ def get_search_config() -> Dict[str, Any]:
         "prefetch_k": get("search.prefetch_k", 200),
         "top_k": get("search.top_k", 10),
     }

 - Convenience getters for common settings
 """
+import copy
 import logging
+import os
 from pathlib import Path
+from typing import Any, Dict, Optional
 logger = logging.getLogger(__name__)
+# Global config cache (raw YAML only; env overrides applied on demand)
+_raw_config_cache: Optional[Dict[str, Any]] = None
+_raw_config_cache_path: Optional[str] = None
 def _env_qdrant_url() -> Optional[str]:
+    """
+    Read the Qdrant URL from the environment.
+    A non-empty QDRANT_URL takes precedence; otherwise the legacy
+    SIGIR_QDRANT_URL value (or None when it is unset) is returned.
+    """
+    preferred = os.getenv("QDRANT_URL")
+    if preferred:
+        return preferred
+    return os.getenv("SIGIR_QDRANT_URL")  # legacy fallback
 def _env_qdrant_api_key() -> Optional[str]:
+    """
+    Read the Qdrant API key from the environment.
+    A non-empty QDRANT_API_KEY takes precedence; otherwise the legacy
+    SIGIR_QDRANT_KEY value (or None when it is unset) is returned.
+    """
+    preferred = os.getenv("QDRANT_API_KEY")
+    if preferred:
+        return preferred
+    return os.getenv("SIGIR_QDRANT_KEY")  # legacy fallback
 def load_config(
     config_path: Optional[str] = None,
     force_reload: bool = False,
+    apply_env_overrides: bool = True,
 ) -> Dict[str, Any]:
     """
     Load configuration from YAML file.
+    The raw file contents are cached together with the path they came from to
+    avoid repeated file I/O. Every call returns a fresh deep copy, so callers
+    may mutate the result without touching the cache.
     Environment variables can override config values.
     Args:
         config_path: Path to config file (auto-detected if None)
         force_reload: Bypass cache and reload from file
+        apply_env_overrides: Apply environment variable overrides to the
+            returned copy; pass False to get the raw file contents
     Returns:
         Configuration dictionary
     """
+    global _raw_config_cache, _raw_config_cache_path
     # Find config file
     if config_path is None:
         config_path = os.getenv("VISUALRAG_CONFIG")
         if config_path is None:
             # Check common locations
             search_paths = [
                 Path.cwd() / "visual_rag.yaml",
                 Path.home() / ".visual_rag" / "config.yaml",
             ]
             for path in search_paths:
                 if path.exists():
                     config_path = str(path)
                     break
+    # The resolved path doubles as the cache key (None when no file was found)
+    effective_path: Optional[str] = str(config_path) if config_path else None
+    # Return cached raw config if available.
+    # - If caller doesn't specify a path (effective_path is None), use whatever was
+    #   loaded most recently (common pattern in apps).
+    # - If a path is specified, only reuse cache when it matches.
+    if (
+        _raw_config_cache is not None
+        and not force_reload
+        and (effective_path is None or _raw_config_cache_path == effective_path)
+    ):
+        cfg = copy.deepcopy(_raw_config_cache)
+        return _apply_env_overrides(cfg) if apply_env_overrides else cfg
     # Load YAML if file exists
     config = {}
     if config_path and Path(config_path).exists():
         try:
             import yaml
+            # Read as UTF-8 explicitly; the platform default encoding varies
+            with open(config_path, "r", encoding="utf-8") as f:
+                loaded = yaml.safe_load(f) or {}
+            if isinstance(loaded, dict):
+                config = loaded
+                logger.info(f"Loaded config from: {config_path}")
+            else:
+                # A top-level list/scalar cannot be looked up by section name,
+                # so fall back to an empty config instead of caching it
+                logger.warning(
+                    f"Config file {config_path} does not contain a mapping; ignoring it"
+                )
         except ImportError:
             logger.warning("PyYAML not installed, using environment variables only")
         except Exception as e:
             logger.warning(f"Could not load config file: {e}")
+    # Cache RAW config (no env overrides)
+    _raw_config_cache = copy.deepcopy(config)
+    _raw_config_cache_path = effective_path
+    # Return resolved or raw depending on caller preference
+    cfg = copy.deepcopy(config)
+    return _apply_env_overrides(cfg) if apply_env_overrides else cfg
 def _apply_env_overrides(config: Dict[str, Any]) -> Dict[str, Any]:
     """Apply environment variable overrides."""
     env_mappings = {
         # Qdrant
         "QDRANT_URL": ["qdrant", "url"],
         "QDRANT_API_KEY": ["qdrant", "api_key"],
         "QDRANT_COLLECTION": ["qdrant", "collection"],
         # Model
         "VISUALRAG_MODEL": ["model", "name"],
         "COLPALI_MODEL_NAME": ["model", "name"],  # Alias
         "EMBEDDING_BATCH_SIZE": ["model", "batch_size"],
         # Cloudinary
         "CLOUDINARY_CLOUD_NAME": ["cloudinary", "cloud_name"],
         "CLOUDINARY_API_KEY": ["cloudinary", "api_key"],
         "CLOUDINARY_API_SECRET": ["cloudinary", "api_secret"],
         # Processing
         "PDF_DPI": ["processing", "dpi"],
         "JPEG_QUALITY": ["processing", "jpeg_quality"],
         # Search
         "SEARCH_STRATEGY": ["search", "strategy"],
         "PREFETCH_K": ["search", "prefetch_k"],
         # Special token handling
         "VISUALRAG_INCLUDE_SPECIAL_TOKENS": ["embedding", "include_special_tokens"],
     }
     for env_var, path in env_mappings.items():
         value = os.getenv(env_var)
         if value is not None:
                 if key not in current:
                     current[key] = {}
                 current = current[key]
             # Convert value to appropriate type
             final_key = path[-1]
             if final_key in current:
                 existing_type = type(current[final_key])
+                # Use `is` for type comparisons (Ruff E721).
+                if existing_type is bool:
                     value = value.lower() in ("true", "1", "yes", "on")
+                elif existing_type is int:
                     value = int(value)
+                elif existing_type is float:
                     value = float(value)
             current[final_key] = value
             logger.debug(f"Config override: {'.'.join(path)} = {value}")
     return config
 def get(key: str, default: Any = None) -> Any:
     """
+    Look up a configuration value by its dot-separated path.
+    The env-resolved config is walked one path segment at a time; `default`
+    is returned as soon as a segment is missing or the current node is not
+    a dict.
     Examples:
         >>> get("qdrant.url")
         >>> get("model.name", "vidore/colSmol-500M")
         >>> get("search.strategy", "multi_vector")
     """
+    node = load_config(apply_env_overrides=True)
+    for segment in key.split("."):
+        if not isinstance(node, dict) or segment not in node:
+            return default
+        node = node[segment]
+    return node
+def get_section(section: str, *, apply_env_overrides: bool = True) -> Dict[str, Any]:
+    """
+    Return one top-level section of the configuration.
+    An empty dict is returned when the section is absent. `apply_env_overrides`
+    is forwarded unchanged to `load_config`.
+    """
+    loaded = load_config(apply_env_overrides=apply_env_overrides)
+    return loaded.get(section, {})
         "prefetch_k": get("search.prefetch_k", 200),
         "top_k": get("search.top_k", 10),
     }

visual_rag/demo_runner.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""
+Launch the Streamlit demo from an installed package.
+Why:
+- After `pip install visual-rag-toolkit`, the repo layout isn't present.
+- We package the `demo/` module and expose `visual_rag.demo()` + `visual-rag-demo`.
+"""
+from __future__ import annotations
+import argparse
+import importlib
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional
+def demo(
+    *,
+    host: str = "0.0.0.0",
+    port: int = 7860,
+    headless: bool = True,
+    open_browser: bool = False,
+    extra_args: Optional[list[str]] = None,
+) -> int:
+    """
+    Launch the Streamlit demo UI.
+    Requirements:
+    - `visual-rag-toolkit[ui,qdrant,embedding,pdf]` (or `visual-rag-toolkit[all]`)
+    Args:
+        host: Passed to Streamlit as `server.address` and `browser.serverAddress`
+        port: Passed to Streamlit as `server.port`
+        headless: Value for `server.headless`
+        open_browser: When False, `browser.serverPort` and `browser.open false`
+            are added to the command line as well
+        extra_args: Extra command-line arguments appended verbatim
+    Returns:
+        Streamlit process exit code.
+    Raises:
+        RuntimeError: If Streamlit cannot be imported or the demo app file
+            cannot be located.
+        ModuleNotFoundError: If the parent `demo` package is not importable
+            (propagated from importlib).
+    """
+    # Local import: `import importlib` alone does not guarantee that the
+    # `util` submodule is loaded.
+    import importlib.util
+    try:
+        import streamlit  # noqa: F401
+    except Exception as e:  # pragma: no cover
+        raise RuntimeError(
+            "Streamlit is not installed. Install with:\n"
+            '  pip install "visual-rag-toolkit[ui,qdrant,embedding,pdf]"'
+        ) from e
+    # Resolve the installed demo entrypoint path WITHOUT importing it: importing
+    # would execute the app script in this process, outside `streamlit run`.
+    spec = importlib.util.find_spec("demo.app")
+    origin = getattr(spec, "origin", None)
+    # A missing origin must not fall back to Path(""), which resolves to the
+    # current directory and would always pass the existence check.
+    app_path = Path(origin).resolve() if origin else None
+    if app_path is None or not app_path.exists():  # pragma: no cover
+        raise RuntimeError("Could not locate installed demo app (demo.app).")
+    # Build a stable Streamlit invocation.
+    cmd = [sys.executable, "-m", "streamlit", "run", str(app_path)]
+    cmd += ["--server.address", str(host)]
+    cmd += ["--server.port", str(int(port))]
+    cmd += ["--server.headless", "true" if headless else "false"]
+    cmd += ["--browser.gatherUsageStats", "false"]
+    cmd += ["--server.runOnSave", "false"]
+    cmd += ["--browser.serverAddress", str(host)]
+    if not open_browser:
+        cmd += ["--browser.serverPort", str(int(port))]
+        cmd += ["--browser.open", "false"]
+    if extra_args:
+        cmd += list(extra_args)
+    env = os.environ.copy()
+    # Turn usage-stats collection off through the environment too; setdefault
+    # keeps any value already set for this variable.
+    env.setdefault("STREAMLIT_BROWSER_GATHER_USAGE_STATS", "false")
+    return subprocess.call(cmd, env=env)
+def main() -> None:
+    """
+    Command-line entry point for the demo launcher.
+    Parses --host, --port, --no-headless and --open, forwards any unrecognised
+    arguments to `demo()` as `extra_args`, and exits with the code it returns.
+    """
+    parser = argparse.ArgumentParser(description="Launch the Visual RAG Toolkit Streamlit demo.")
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=7860)
+    parser.add_argument(
+        "--no-headless", action="store_true", help="Run with a browser window (not headless)."
+    )
+    parser.add_argument("--open", action="store_true", help="Open browser automatically.")
+    known, passthrough = parser.parse_known_args()
+    run_headless = not known.no_headless
+    exit_code = demo(
+        host=known.host,
+        port=known.port,
+        headless=run_headless,
+        open_browser=bool(known.open),
+        extra_args=passthrough,
+    )
+    raise SystemExit(exit_code)

visual_rag/embedding/__init__.py CHANGED Viewed

@@ -6,19 +6,18 @@ Provides:
 - Pooling utilities: tile-level, global, MaxSim scoring
 """
-from visual_rag.embedding.visual_embedder import VisualEmbedder, ColPaliEmbedder
 from visual_rag.embedding.pooling import (
-    tile_level_mean_pooling,
-    global_mean_pooling,
-    compute_maxsim_score,
     compute_maxsim_batch,
 )
 __all__ = [
     # Main embedder
     "VisualEmbedder",
     "ColPaliEmbedder",  # Backward compatibility alias
     # Pooling functions
     "tile_level_mean_pooling",
     "global_mean_pooling",

 - Pooling utilities: tile-level, global, MaxSim scoring
 """
 from visual_rag.embedding.pooling import (
     compute_maxsim_batch,
+    compute_maxsim_score,
+    global_mean_pooling,
+    tile_level_mean_pooling,
 )
+from visual_rag.embedding.visual_embedder import ColPaliEmbedder, VisualEmbedder
 __all__ = [
     # Main embedder
     "VisualEmbedder",
     "ColPaliEmbedder",  # Backward compatibility alias
     # Pooling functions
     "tile_level_mean_pooling",
     "global_mean_pooling",

visual_rag/embedding/pooling.py CHANGED Viewed

@@ -7,10 +7,11 @@ Provides:
 - MaxSim scoring for ColBERT-style late interaction
 """
 import numpy as np
 import torch
-from typing import Union, Optional
-import logging
 logger = logging.getLogger(__name__)
@@ -39,24 +40,24 @@ def tile_level_mean_pooling(
 ) -> np.ndarray:
     """
     Compute tile-level mean pooling for multi-vector embeddings.
     Instead of collapsing to 1×dim (global pooling), this preserves spatial
     structure by computing mean per tile → num_tiles × dim.
     This is our NOVEL contribution for scalable visual retrieval:
     - Faster than full MaxSim (fewer vectors to compare)
     - More accurate than global pooling (preserves spatial info)
     - Ideal for two-stage retrieval (prefetch with pooled, rerank with full)
     Args:
         embedding: Visual token embeddings [num_visual_tokens, dim]
         num_tiles: Number of tiles (including global tile)
         patches_per_tile: Patches per tile (64 for ColSmol)
         output_dtype: Output dtype (default: infer from input, fp16→fp16, bf16→fp32)
     Returns:
         Tile-level pooled embeddings [num_tiles, dim]
     Example:
         >>> # Image with 4×3 tiles + 1 global = 13 tiles
         >>> # Each tile has 64 patches → 832 visual tokens
@@ -71,31 +72,29 @@ def tile_level_mean_pooling(
             emb_np = embedding.cpu().numpy().astype(np.float32)
     else:
         emb_np = np.array(embedding, dtype=np.float32)
     num_visual_tokens = emb_np.shape[0]
     expected_tokens = num_tiles * patches_per_tile
     if num_visual_tokens != expected_tokens:
-        logger.debug(
-            f"Token count mismatch: {num_visual_tokens} vs expected {expected_tokens}"
-        )
         actual_tiles = num_visual_tokens // patches_per_tile
         if actual_tiles * patches_per_tile != num_visual_tokens:
             actual_tiles += 1
         num_tiles = actual_tiles
     tile_embeddings = []
     for tile_idx in range(num_tiles):
         start_idx = tile_idx * patches_per_tile
         end_idx = min(start_idx + patches_per_tile, num_visual_tokens)
         if start_idx >= num_visual_tokens:
             break
         tile_patches = emb_np[start_idx:end_idx]
         tile_mean = tile_patches.mean(axis=0)
         tile_embeddings.append(tile_mean)
     return np.array(tile_embeddings, dtype=out_dtype)
@@ -116,7 +115,9 @@ def colpali_row_mean_pooling(
     num_tokens, dim = emb_np.shape
     expected = int(grid_size) * int(grid_size)
     if num_tokens != expected:
-        raise ValueError(f"Expected {expected} visual tokens for grid_size={grid_size}, got {num_tokens}")
     grid = emb_np.reshape(int(grid_size), int(grid_size), int(dim))
     pooled = grid.mean(axis=1)
@@ -157,7 +158,9 @@ def colsmol_experimental_pooling(
         last_tile_start = (int(num_tiles) - 1) * int(patches_per_tile)
     prefix = emb_np[:last_tile_start]
-    last_tile = emb_np[last_tile_start : min(last_tile_start + int(patches_per_tile), num_visual_tokens)]
     if prefix.size:
         prefix_tiles = prefix.reshape(-1, int(patches_per_tile), int(dim))
@@ -174,7 +177,7 @@ def colpali_experimental_pooling_from_rows(
 ) -> np.ndarray:
     """
     Experimental "convolution-style" pooling with window size 3.
     For N input rows, produces N + 2 output vectors:
     - Position 0: row[0] alone (1 row)
     - Position 1: mean(rows[0:2]) (2 rows)
@@ -182,7 +185,7 @@ def colpali_experimental_pooling_from_rows(
     - Positions 3 to N-1: sliding window of 3 (rows[i-2:i+1])
     - Position N: mean(rows[N-2:N]) (last 2 rows)
     - Position N+1: row[N-1] alone (last row)
     For N=32 rows: produces 34 vectors.
     """
     out_dtype = _infer_output_dtype(row_vectors, output_dtype)
@@ -202,13 +205,16 @@ def colpali_experimental_pooling_from_rows(
     if n == 2:
         return np.stack([rows[0], rows[:2].mean(axis=0), rows[1]], axis=0).astype(out_dtype)
     if n == 3:
-        return np.stack([
-            rows[0],
-            rows[:2].mean(axis=0),
-            rows[:3].mean(axis=0),
-            rows[1:3].mean(axis=0),
-            rows[2],
-        ], axis=0).astype(out_dtype)
     out = np.zeros((n + 2, dim), dtype=np.float32)
     out[0] = rows[0]
@@ -227,14 +233,14 @@ def global_mean_pooling(
 ) -> np.ndarray:
     """
     Compute global mean pooling → single vector.
     This is the simplest pooling but loses all spatial information.
     Use for fastest retrieval when accuracy can be sacrificed.
     Args:
         embedding: Multi-vector embeddings [num_tokens, dim]
         output_dtype: Output dtype (default: infer from input, fp16→fp16, bf16→fp32)
     Returns:
         Pooled vector [dim]
     """
@@ -246,7 +252,7 @@ def global_mean_pooling(
             emb_np = embedding.cpu().numpy()
     else:
         emb_np = np.array(embedding)
     return emb_np.mean(axis=0).astype(out_dtype)
@@ -257,21 +263,21 @@ def compute_maxsim_score(
 ) -> float:
     """
     Compute ColBERT-style MaxSim late interaction score.
     For each query token, finds max similarity with any document token,
     then sums across query tokens.
     This is the standard scoring for ColBERT/ColPali:
     score = Σ_q max_d (sim(q, d))
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embedding: Document embeddings [num_doc_tokens, dim]
         normalize: L2 normalize embeddings before scoring (recommended)
     Returns:
         MaxSim score (higher is better)
     Example:
         >>> query = embedder.embed_query("budget allocation")
         >>> doc = embeddings[0]  # From embed_images
@@ -282,22 +288,20 @@ def compute_maxsim_score(
         query_norm = query_embedding / (
             np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-8
         )
-        doc_norm = doc_embedding / (
-            np.linalg.norm(doc_embedding, axis=1, keepdims=True) + 1e-8
-        )
     else:
         query_norm = query_embedding
         doc_norm = doc_embedding
     # Compute similarity matrix: [num_query, num_doc]
     similarity_matrix = np.dot(query_norm, doc_norm.T)
     # MaxSim: For each query token, take max similarity with any doc token
     max_similarities = similarity_matrix.max(axis=1)
     # Sum across query tokens
     score = float(max_similarities.sum())
     return score
@@ -308,12 +312,12 @@ def compute_maxsim_batch(
 ) -> list:
     """
     Compute MaxSim scores for multiple documents efficiently.
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embeddings: List of document embeddings
         normalize: L2 normalize embeddings
     Returns:
         List of MaxSim scores
     """
@@ -324,18 +328,16 @@ def compute_maxsim_batch(
         )
     else:
         query_norm = query_embedding
     scores = []
     for doc_emb in doc_embeddings:
         if normalize:
-            doc_norm = doc_emb / (
-                np.linalg.norm(doc_emb, axis=1, keepdims=True) + 1e-8
-            )
         else:
             doc_norm = doc_emb
         sim_matrix = np.dot(query_norm, doc_norm.T)
         max_sims = sim_matrix.max(axis=1)
         scores.append(float(max_sims.sum()))
     return scores

 - MaxSim scoring for ColBERT-style late interaction
 """
+import logging
+from typing import Optional, Union
 import numpy as np
 import torch
 logger = logging.getLogger(__name__)
 ) -> np.ndarray:
     """
     Compute tile-level mean pooling for multi-vector embeddings.
     Instead of collapsing to 1×dim (global pooling), this preserves spatial
     structure by computing mean per tile → num_tiles × dim.
     This is our NOVEL contribution for scalable visual retrieval:
     - Faster than full MaxSim (fewer vectors to compare)
     - More accurate than global pooling (preserves spatial info)
     - Ideal for two-stage retrieval (prefetch with pooled, rerank with full)
     Args:
         embedding: Visual token embeddings [num_visual_tokens, dim]
         num_tiles: Number of tiles (including global tile)
         patches_per_tile: Patches per tile (64 for ColSmol)
         output_dtype: Output dtype (default: infer from input, fp16→fp16, bf16→fp32)
     Returns:
         Tile-level pooled embeddings [num_tiles, dim]
     Example:
         >>> # Image with 4×3 tiles + 1 global = 13 tiles
         >>> # Each tile has 64 patches → 832 visual tokens
             emb_np = embedding.cpu().numpy().astype(np.float32)
     else:
         emb_np = np.array(embedding, dtype=np.float32)
     num_visual_tokens = emb_np.shape[0]
     expected_tokens = num_tiles * patches_per_tile
     if num_visual_tokens != expected_tokens:
+        logger.debug(f"Token count mismatch: {num_visual_tokens} vs expected {expected_tokens}")
         actual_tiles = num_visual_tokens // patches_per_tile
         if actual_tiles * patches_per_tile != num_visual_tokens:
             actual_tiles += 1
         num_tiles = actual_tiles
     tile_embeddings = []
     for tile_idx in range(num_tiles):
         start_idx = tile_idx * patches_per_tile
         end_idx = min(start_idx + patches_per_tile, num_visual_tokens)
         if start_idx >= num_visual_tokens:
             break
         tile_patches = emb_np[start_idx:end_idx]
         tile_mean = tile_patches.mean(axis=0)
         tile_embeddings.append(tile_mean)
     return np.array(tile_embeddings, dtype=out_dtype)
     num_tokens, dim = emb_np.shape
     expected = int(grid_size) * int(grid_size)
     if num_tokens != expected:
+        raise ValueError(
+            f"Expected {expected} visual tokens for grid_size={grid_size}, got {num_tokens}"
+        )
     grid = emb_np.reshape(int(grid_size), int(grid_size), int(dim))
     pooled = grid.mean(axis=1)
         last_tile_start = (int(num_tiles) - 1) * int(patches_per_tile)
     prefix = emb_np[:last_tile_start]
+    last_tile = emb_np[
+        last_tile_start : min(last_tile_start + int(patches_per_tile), num_visual_tokens)
+    ]
     if prefix.size:
         prefix_tiles = prefix.reshape(-1, int(patches_per_tile), int(dim))
 ) -> np.ndarray:
     """
     Experimental "convolution-style" pooling with window size 3.
     For N input rows, produces N + 2 output vectors:
     - Position 0: row[0] alone (1 row)
     - Position 1: mean(rows[0:2]) (2 rows)
     - Positions 3 to N-1: sliding window of 3 (rows[i-2:i+1])
     - Position N: mean(rows[N-2:N]) (last 2 rows)
     - Position N+1: row[N-1] alone (last row)
     For N=32 rows: produces 34 vectors.
     """
     out_dtype = _infer_output_dtype(row_vectors, output_dtype)
     if n == 2:
         return np.stack([rows[0], rows[:2].mean(axis=0), rows[1]], axis=0).astype(out_dtype)
     if n == 3:
+        return np.stack(
+            [
+                rows[0],
+                rows[:2].mean(axis=0),
+                rows[:3].mean(axis=0),
+                rows[1:3].mean(axis=0),
+                rows[2],
+            ],
+            axis=0,
+        ).astype(out_dtype)
     out = np.zeros((n + 2, dim), dtype=np.float32)
     out[0] = rows[0]
 ) -> np.ndarray:
     """
     Compute global mean pooling → single vector.
     This is the simplest pooling but loses all spatial information.
     Use for fastest retrieval when accuracy can be sacrificed.
     Args:
         embedding: Multi-vector embeddings [num_tokens, dim]
         output_dtype: Output dtype (default: infer from input, fp16→fp16, bf16→fp32)
     Returns:
         Pooled vector [dim]
     """
             emb_np = embedding.cpu().numpy()
     else:
         emb_np = np.array(embedding)
     return emb_np.mean(axis=0).astype(out_dtype)
 ) -> float:
     """
     Compute ColBERT-style MaxSim late interaction score.
     For each query token, finds max similarity with any document token,
     then sums across query tokens.
     This is the standard scoring for ColBERT/ColPali:
     score = Σ_q max_d (sim(q, d))
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embedding: Document embeddings [num_doc_tokens, dim]
         normalize: L2 normalize embeddings before scoring (recommended)
     Returns:
         MaxSim score (higher is better)
     Example:
         >>> query = embedder.embed_query("budget allocation")
         >>> doc = embeddings[0]  # From embed_images
         query_norm = query_embedding / (
             np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-8
         )
+        doc_norm = doc_embedding / (np.linalg.norm(doc_embedding, axis=1, keepdims=True) + 1e-8)
     else:
         query_norm = query_embedding
         doc_norm = doc_embedding
     # Compute similarity matrix: [num_query, num_doc]
     similarity_matrix = np.dot(query_norm, doc_norm.T)
     # MaxSim: For each query token, take max similarity with any doc token
     max_similarities = similarity_matrix.max(axis=1)
     # Sum across query tokens
     score = float(max_similarities.sum())
     return score
 ) -> list:
     """
     Compute MaxSim scores for multiple documents efficiently.
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embeddings: List of document embeddings
         normalize: L2 normalize embeddings
     Returns:
         List of MaxSim scores
     """
         )
     else:
         query_norm = query_embedding
     scores = []
     for doc_emb in doc_embeddings:
         if normalize:
+            doc_norm = doc_emb / (np.linalg.norm(doc_emb, axis=1, keepdims=True) + 1e-8)
         else:
             doc_norm = doc_emb
         sim_matrix = np.dot(query_norm, doc_norm.T)
         max_sims = sim_matrix.max(axis=1)
         scores.append(float(max_sims.sum()))
     return scores

visual_rag/embedding/visual_embedder.py CHANGED Viewed

@@ -12,12 +12,12 @@ The embedder is BACKEND-AGNOSTIC - configure which model to use via the
 """
 import gc
-import os
 import logging
-from typing import List, Dict, Any, Optional, Tuple, Union
-import torch
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
@@ -27,11 +27,11 @@ logger = logging.getLogger(__name__)
 class VisualEmbedder:
     """
     Visual document embedder supporting multiple backends.
     Currently supports:
     - ColPali family (ColSmol-500M, ColPali, ColQwen2)
     - More backends can be added
     Args:
         model_name: HuggingFace model name (e.g., "vidore/colSmol-500M")
         backend: Backend type ("colpali", "auto"). "auto" detects from model_name.
@@ -39,23 +39,23 @@ class VisualEmbedder:
         torch_dtype: Data type for model weights
         batch_size: Batch size for image processing
         filter_special_tokens: Filter special tokens from query embeddings
     Example:
         >>> # Auto-detect backend from model name
         >>> embedder = VisualEmbedder(model_name="vidore/colSmol-500M")
-        >>>
         >>> # Embed images
         >>> image_embeddings = embedder.embed_images(images)
-        >>>
         >>> # Embed query
         >>> query_embedding = embedder.embed_query("What is the budget?")
-        >>>
         >>> # Get token info for saliency maps
         >>> embeddings, token_infos = embedder.embed_images(
         ...     images, return_token_info=True
         ... )
     """
     # Known model families and their backends
     MODEL_BACKENDS = {
         "colsmol": "colpali",
@@ -63,7 +63,7 @@ class VisualEmbedder:
         "colqwen": "colpali",
         "colidefics": "colpali",
     }
     def __init__(
         self,
         model_name: str = "vidore/colSmol-500M",
@@ -81,15 +81,15 @@ class VisualEmbedder:
         if processor_speed not in ("fast", "slow", "auto"):
             raise ValueError("processor_speed must be one of: fast, slow, auto")
         self.processor_speed = processor_speed
         if os.getenv("VISUALRAG_INCLUDE_SPECIAL_TOKENS"):
             self.filter_special_tokens = False
             logger.info("Special token filtering disabled via VISUALRAG_INCLUDE_SPECIAL_TOKENS")
         if backend == "auto":
             backend = self._detect_backend(model_name)
         self.backend = backend
         if device is None:
             if torch.cuda.is_available():
                 device = "cuda"
@@ -98,53 +98,55 @@ class VisualEmbedder:
             else:
                 device = "cpu"
         self.device = device
         if torch_dtype is None:
             if device == "cuda":
                 torch_dtype = torch.bfloat16
             else:
                 torch_dtype = torch.float32
         self.torch_dtype = torch_dtype
         if output_dtype is None:
             if torch_dtype == torch.float16:
                 output_dtype = np.float16
             else:
                 output_dtype = np.float32
         self.output_dtype = output_dtype
         self._model = None
         self._processor = None
         self._image_token_id = None
-        logger.info(f"🤖 VisualEmbedder initialized")
         logger.info(f"   Model: {model_name}")
         logger.info(f"   Backend: {backend}")
-        logger.info(f"   Device: {device}, torch_dtype: {torch_dtype}, output_dtype: {output_dtype}")
     def _detect_backend(self, model_name: str) -> str:
         """Auto-detect backend from model name."""
         model_lower = model_name.lower()
         for key, backend in self.MODEL_BACKENDS.items():
             if key in model_lower:
                 logger.debug(f"Detected backend '{backend}' from model name")
                 return backend
         # Default to colpali for unknown models
         logger.warning(f"Unknown model '{model_name}', defaulting to 'colpali' backend")
         return "colpali"
     def _load_model(self):
         """Lazy load the model when first needed."""
         if self._model is not None:
             return
         if self.backend == "colpali":
             self._load_colpali_model()
         else:
             raise ValueError(f"Unknown backend: {self.backend}")
     def _load_colpali_model(self):
         """Load ColPali-family model."""
         try:
@@ -162,7 +164,7 @@ class VisualEmbedder:
                 "pip install visual-rag-toolkit[embedding] or "
                 "pip install colpali-engine"
             )
         logger.info(f"🤖 Loading ColPali model: {self.model_name}")
         logger.info(f"   Device: {self.device}, dtype: {self.torch_dtype}")
@@ -170,7 +172,7 @@ class VisualEmbedder:
             if self.processor_speed == "auto":
                 return {}
             return {"use_fast": self.processor_speed == "fast"}
         from transformers import AutoConfig
         cfg = AutoConfig.from_pretrained(self.model_name)
@@ -183,12 +185,16 @@ class VisualEmbedder:
                 device_map=self.device,
             ).eval()
             try:
-                self._processor = ColPaliProcessor.from_pretrained(self.model_name, **_processor_kwargs())
             except TypeError:
                 self._processor = ColPaliProcessor.from_pretrained(self.model_name)
             except Exception:
                 if self.processor_speed == "fast":
-                    self._processor = ColPaliProcessor.from_pretrained(self.model_name, use_fast=False)
                 else:
                     raise
             self._image_token_id = self._processor.image_token_id
@@ -202,12 +208,18 @@ class VisualEmbedder:
                 device_map=self.device,
             ).eval()
             try:
-                self._processor = ColQwen2Processor.from_pretrained(self.model_name, device_map=self.device, **_processor_kwargs())
             except TypeError:
-                self._processor = ColQwen2Processor.from_pretrained(self.model_name, device_map=self.device)
             except Exception:
                 if self.processor_speed == "fast":
-                    self._processor = ColQwen2Processor.from_pretrained(self.model_name, device_map=self.device, use_fast=False)
                 else:
                     raise
             self._image_token_id = self._processor.image_token_id
@@ -231,33 +243,37 @@ class VisualEmbedder:
             attn_implementation=attn_implementation,
         ).eval()
         try:
-            self._processor = ColIdefics3Processor.from_pretrained(self.model_name, **_processor_kwargs())
         except TypeError:
             self._processor = ColIdefics3Processor.from_pretrained(self.model_name)
         except Exception:
             if self.processor_speed == "fast":
-                self._processor = ColIdefics3Processor.from_pretrained(self.model_name, use_fast=False)
             else:
                 raise
         self._image_token_id = self._processor.image_token_id
         logger.info("✅ Model loaded successfully")
     @property
     def model(self):
         self._load_model()
         return self._model
     @property
     def processor(self):
         self._load_model()
         return self._processor
     @property
     def image_token_id(self):
         self._load_model()
         return self._image_token_id
     def embed_query(
         self,
         query_text: str,
@@ -265,31 +281,31 @@ class VisualEmbedder:
     ) -> torch.Tensor:
         """
         Generate embedding for a text query.
         By default, filters out special tokens (CLS, SEP, PAD) to keep only
         meaningful text tokens for better MaxSim matching.
         Args:
             query_text: Natural language query string
             filter_special_tokens: Override instance-level setting
         Returns:
             Query embedding tensor of shape [num_tokens, embedding_dim]
         """
         should_filter = (
-            filter_special_tokens
-            if filter_special_tokens is not None
             else self.filter_special_tokens
         )
         with torch.no_grad():
             processed = self.processor.process_queries([query_text]).to(self.model.device)
             embedding = self.model(**processed)
         # Remove batch dimension: [1, tokens, dim] -> [tokens, dim]
         if embedding.dim() == 3:
             embedding = embedding.squeeze(0)
         if should_filter:
             # Filter special tokens based on attention mask
             attention_mask = processed.get("attention_mask")
@@ -297,7 +313,7 @@ class VisualEmbedder:
                 # Keep only tokens with attention_mask = 1
                 valid_mask = attention_mask.squeeze(0).bool()
                 embedding = embedding[valid_mask]
                 # Additionally filter padding tokens if present
                 input_ids = processed.get("input_ids")
                 if input_ids is not None:
@@ -307,11 +323,11 @@ class VisualEmbedder:
                     non_special_mask = input_ids >= 4
                     if non_special_mask.any():
                         embedding = embedding[non_special_mask]
             logger.debug(f"Query embedding: {embedding.shape[0]} tokens after filtering")
         else:
             logger.debug(f"Query embedding: {embedding.shape[0]} tokens (unfiltered)")
         return embedding
     def embed_queries(
@@ -327,7 +343,9 @@ class VisualEmbedder:
         Returns a list of tensors, each of shape [num_tokens, embedding_dim].
         """
         should_filter = (
-            filter_special_tokens if filter_special_tokens is not None else self.filter_special_tokens
         )
         batch_size = batch_size or self.batch_size
@@ -368,7 +386,7 @@ class VisualEmbedder:
                 torch.mps.empty_cache()
         return outputs
     def embed_images(
         self,
         images: List[Image.Image],
@@ -378,19 +396,19 @@ class VisualEmbedder:
     ) -> Union[List[torch.Tensor], Tuple[List[torch.Tensor], List[Dict[str, Any]]]]:
         """
         Generate embeddings for a list of images.
         Args:
             images: List of PIL Images
             batch_size: Override instance batch size
             return_token_info: Also return token metadata (for saliency maps)
             show_progress: Show progress bar
         Returns:
             If return_token_info=False:
                 List of embedding tensors [num_patches, dim]
             If return_token_info=True:
                 Tuple of (embeddings, token_infos)
         Token info contains:
             - visual_token_indices: Indices of visual tokens in embedding
             - num_visual_tokens: Count of visual tokens
@@ -398,54 +416,60 @@ class VisualEmbedder:
             - num_tiles: Total tiles (n_rows × n_cols + 1 global)
         """
         batch_size = batch_size or self.batch_size
-        if self.device == "mps" and "colpali" in (self.model_name or "").lower() and int(batch_size) > 1:
             batch_size = 1
         embeddings = []
         token_infos = [] if return_token_info else None
         iterator = range(0, len(images), batch_size)
         if show_progress:
             iterator = tqdm(iterator, desc="🎨 Embedding", unit="batch")
         for i in iterator:
-            batch = images[i:i + batch_size]
             with torch.no_grad():
                 processed = self.processor.process_images(batch).to(self.model.device)
                 # Extract token info before model forward
                 if return_token_info:
                     input_ids = processed["input_ids"]
                     batch_n_rows = processed.get("n_rows")
                     batch_n_cols = processed.get("n_cols")
                     for j in range(input_ids.shape[0]):
                         # Find visual token indices
-                        image_token_mask = (input_ids[j] == self.image_token_id)
                         visual_indices = torch.where(image_token_mask)[0].cpu().numpy().tolist()
                         n_rows = batch_n_rows[j].item() if batch_n_rows is not None else None
                         n_cols = batch_n_cols[j].item() if batch_n_cols is not None else None
-                        token_infos.append({
-                            "visual_token_indices": visual_indices,
-                            "num_visual_tokens": len(visual_indices),
-                            "n_rows": n_rows,
-                            "n_cols": n_cols,
-                            "num_tiles": (n_rows * n_cols + 1) if n_rows and n_cols else None,
-                        })
                 # Generate embeddings
                 batch_embeddings = self.model(**processed)
             # Extract per-image embeddings
             if isinstance(batch_embeddings, torch.Tensor) and batch_embeddings.dim() == 3:
                 for j in range(batch_embeddings.shape[0]):
                     embeddings.append(batch_embeddings[j].cpu())
             else:
                 embeddings.extend([e.cpu() for e in batch_embeddings])
             # Memory cleanup
             del processed, batch_embeddings
             gc.collect()
@@ -453,11 +477,11 @@ class VisualEmbedder:
                 torch.cuda.empty_cache()
             elif torch.backends.mps.is_available():
                 torch.mps.empty_cache()
         if return_token_info:
             return embeddings, token_infos
         return embeddings
     def extract_visual_embedding(
         self,
         full_embedding: torch.Tensor,
@@ -465,18 +489,18 @@ class VisualEmbedder:
     ) -> np.ndarray:
         """
         Extract only visual token embeddings from full embedding.
         Filters out special tokens, keeping only visual patches for MaxSim.
         Args:
             full_embedding: Full embedding [all_tokens, dim]
             token_info: Token info dict from embed_images
         Returns:
             Visual embedding array [num_visual_tokens, dim]
         """
         visual_indices = token_info["visual_token_indices"]
         if isinstance(full_embedding, torch.Tensor):
             if full_embedding.dtype == torch.bfloat16:
                 visual_emb = full_embedding[visual_indices].cpu().float().numpy()
@@ -484,7 +508,7 @@ class VisualEmbedder:
                 visual_emb = full_embedding[visual_indices].cpu().numpy()
         else:
             visual_emb = np.array(full_embedding)[visual_indices]
         return visual_emb.astype(self.output_dtype)
     def mean_pool_visual_embedding(
@@ -511,17 +535,23 @@ class VisualEmbedder:
             n_rows = (token_info or {}).get("n_rows")
             n_cols = (token_info or {}).get("n_cols")
             num_tiles = int(n_rows) * int(n_cols) + 1 if n_rows and n_cols else 13
-            return tile_level_mean_pooling(visual_np, num_tiles=num_tiles, patches_per_tile=64, output_dtype=self.output_dtype)
         num_tokens = int(visual_np.shape[0])
         grid = int(round(float(num_tokens) ** 0.5))
         if grid * grid != num_tokens:
-            raise ValueError(f"Cannot infer square grid from num_visual_tokens={num_tokens} for model={self.model_name}")
         if int(target_vectors) != int(grid):
             raise ValueError(
                 f"target_vectors={target_vectors} does not match inferred grid_size={grid} for model={self.model_name}"
             )
-        return colpali_row_mean_pooling(visual_np, grid_size=int(target_vectors), output_dtype=self.output_dtype)
     def global_pool_from_mean_pool(self, mean_pool: np.ndarray) -> np.ndarray:
         if mean_pool.size == 0:
@@ -536,7 +566,10 @@ class VisualEmbedder:
         target_vectors: int = 32,
         mean_pool: Optional[np.ndarray] = None,
     ) -> np.ndarray:
-        from visual_rag.embedding.pooling import colpali_experimental_pooling_from_rows, colsmol_experimental_pooling
         model_lower = (self.model_name or "").lower()
         is_colsmol = "colsmol" in model_lower
@@ -550,7 +583,11 @@ class VisualEmbedder:
             visual_np = np.array(visual_embedding, dtype=np.float32)
         if is_colsmol:
-            if mean_pool is not None and getattr(mean_pool, "shape", None) is not None and int(mean_pool.shape[0]) > 0:
                 num_tiles = int(mean_pool.shape[0])
             else:
                 num_tiles = (token_info or {}).get("num_tiles")
@@ -563,14 +600,23 @@ class VisualEmbedder:
                     if int(num_tiles) * patches_per_tile != int(num_visual_tokens):
                         num_tiles = int(num_tiles) + 1
                 num_tiles = int(num_tiles)
-            return colsmol_experimental_pooling(visual_np, num_tiles=num_tiles, patches_per_tile=64, output_dtype=self.output_dtype)
-        rows = mean_pool if mean_pool is not None else self.mean_pool_visual_embedding(visual_np, token_info, target_vectors=target_vectors)
         if int(rows.shape[0]) != int(target_vectors):
             raise ValueError(
                 f"experimental pooling expects mean_pool to have {target_vectors} rows, got {rows.shape[0]} for model={self.model_name}"
             )
         return colpali_experimental_pooling_from_rows(rows, output_dtype=self.output_dtype)
 # Backward compatibility alias
 ColPaliEmbedder = VisualEmbedder

 """
 import gc
 import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
+import torch
 from PIL import Image
 from tqdm import tqdm
 class VisualEmbedder:
     """
     Visual document embedder supporting multiple backends.
     Currently supports:
     - ColPali family (ColSmol-500M, ColPali, ColQwen2)
     - More backends can be added
     Args:
         model_name: HuggingFace model name (e.g., "vidore/colSmol-500M")
         backend: Backend type ("colpali", "auto"). "auto" detects from model_name.
         torch_dtype: Data type for model weights
         batch_size: Batch size for image processing
         filter_special_tokens: Filter special tokens from query embeddings
     Example:
         >>> # Auto-detect backend from model name
         >>> embedder = VisualEmbedder(model_name="vidore/colSmol-500M")
+        >>>
         >>> # Embed images
         >>> image_embeddings = embedder.embed_images(images)
+        >>>
         >>> # Embed query
         >>> query_embedding = embedder.embed_query("What is the budget?")
+        >>>
         >>> # Get token info for saliency maps
         >>> embeddings, token_infos = embedder.embed_images(
         ...     images, return_token_info=True
         ... )
     """
     # Known model families and their backends
     MODEL_BACKENDS = {
         "colsmol": "colpali",
         "colqwen": "colpali",
         "colidefics": "colpali",
     }
     def __init__(
         self,
         model_name: str = "vidore/colSmol-500M",
         if processor_speed not in ("fast", "slow", "auto"):
             raise ValueError("processor_speed must be one of: fast, slow, auto")
         self.processor_speed = processor_speed
         if os.getenv("VISUALRAG_INCLUDE_SPECIAL_TOKENS"):
             self.filter_special_tokens = False
             logger.info("Special token filtering disabled via VISUALRAG_INCLUDE_SPECIAL_TOKENS")
         if backend == "auto":
             backend = self._detect_backend(model_name)
         self.backend = backend
         if device is None:
             if torch.cuda.is_available():
                 device = "cuda"
             else:
                 device = "cpu"
         self.device = device
         if torch_dtype is None:
             if device == "cuda":
                 torch_dtype = torch.bfloat16
             else:
                 torch_dtype = torch.float32
         self.torch_dtype = torch_dtype
         if output_dtype is None:
             if torch_dtype == torch.float16:
                 output_dtype = np.float16
             else:
                 output_dtype = np.float32
         self.output_dtype = output_dtype
         self._model = None
         self._processor = None
         self._image_token_id = None
+        logger.info("🤖 VisualEmbedder initialized")
         logger.info(f"   Model: {model_name}")
         logger.info(f"   Backend: {backend}")
+        logger.info(
+            f"   Device: {device}, torch_dtype: {torch_dtype}, output_dtype: {output_dtype}"
+        )
     def _detect_backend(self, model_name: str) -> str:
         """Pick a backend by searching the model name for a known family key.

         The lookup is case-insensitive and substring-based: the first key of
         ``MODEL_BACKENDS`` (in dict order) found inside the lower-cased model
         name decides the backend.

         Args:
             model_name: Model identifier, e.g. a HuggingFace repo name.

         Returns:
             The matching backend name, or ``"colpali"`` when no key matches
             (a warning is logged in that case).
         """
         lowered = model_name.lower()
         matched = next(
             (name for fragment, name in self.MODEL_BACKENDS.items() if fragment in lowered),
             None,
         )
         if matched is None:
             # No known family fragment in the name: fall back to colpali.
             logger.warning(f"Unknown model '{model_name}', defaulting to 'colpali' backend")
             return "colpali"
         logger.debug(f"Detected backend '{matched}' from model name")
         return matched
     def _load_model(self):
         """Lazy load the model when first needed."""
         if self._model is not None:
             return
         if self.backend == "colpali":
             self._load_colpali_model()
         else:
             raise ValueError(f"Unknown backend: {self.backend}")
     def _load_colpali_model(self):
         """Load ColPali-family model."""
         try:
                 "pip install visual-rag-toolkit[embedding] or "
                 "pip install colpali-engine"
             )
         logger.info(f"🤖 Loading ColPali model: {self.model_name}")
         logger.info(f"   Device: {self.device}, dtype: {self.torch_dtype}")
             if self.processor_speed == "auto":
                 return {}
             return {"use_fast": self.processor_speed == "fast"}
         from transformers import AutoConfig
         cfg = AutoConfig.from_pretrained(self.model_name)
                 device_map=self.device,
             ).eval()
             try:
+                self._processor = ColPaliProcessor.from_pretrained(
+                    self.model_name, **_processor_kwargs()
+                )
             except TypeError:
                 self._processor = ColPaliProcessor.from_pretrained(self.model_name)
             except Exception:
                 if self.processor_speed == "fast":
+                    self._processor = ColPaliProcessor.from_pretrained(
+                        self.model_name, use_fast=False
+                    )
                 else:
                     raise
             self._image_token_id = self._processor.image_token_id
                 device_map=self.device,
             ).eval()
             try:
+                self._processor = ColQwen2Processor.from_pretrained(
+                    self.model_name, device_map=self.device, **_processor_kwargs()
+                )
             except TypeError:
+                self._processor = ColQwen2Processor.from_pretrained(
+                    self.model_name, device_map=self.device
+                )
             except Exception:
                 if self.processor_speed == "fast":
+                    self._processor = ColQwen2Processor.from_pretrained(
+                        self.model_name, device_map=self.device, use_fast=False
+                    )
                 else:
                     raise
             self._image_token_id = self._processor.image_token_id
             attn_implementation=attn_implementation,
         ).eval()
         try:
+            self._processor = ColIdefics3Processor.from_pretrained(
+                self.model_name, **_processor_kwargs()
+            )
         except TypeError:
             self._processor = ColIdefics3Processor.from_pretrained(self.model_name)
         except Exception:
             if self.processor_speed == "fast":
+                self._processor = ColIdefics3Processor.from_pretrained(
+                    self.model_name, use_fast=False
+                )
             else:
                 raise
         self._image_token_id = self._processor.image_token_id
         logger.info("✅ Model loaded successfully")
     @property
     def model(self):
         """Return the loaded model object.

         Calls ``_load_model()`` before reading ``_model``; that call returns
         immediately once ``_model`` has been set.
         """
         self._load_model()
         return self._model
     @property
     def processor(self):
         """Return the processor paired with the model.

         Calls ``_load_model()`` before reading ``_processor``, so the
         processor is available on first access.
         """
         self._load_model()
         return self._processor
     @property
     def image_token_id(self):
         """Return the image token id captured from the processor at load time.

         Calls ``_load_model()`` before reading ``_image_token_id``.
         ``embed_images`` compares input ids against this value to locate
         visual tokens.
         """
         self._load_model()
         return self._image_token_id
     def embed_query(
         self,
         query_text: str,
     ) -> torch.Tensor:
         """
         Generate embedding for a text query.
         By default, filters out special tokens (CLS, SEP, PAD) to keep only
         meaningful text tokens for better MaxSim matching.
         Args:
             query_text: Natural language query string
             filter_special_tokens: Override instance-level setting
         Returns:
             Query embedding tensor of shape [num_tokens, embedding_dim]
         """
         should_filter = (
+            filter_special_tokens
+            if filter_special_tokens is not None
             else self.filter_special_tokens
         )
         with torch.no_grad():
             processed = self.processor.process_queries([query_text]).to(self.model.device)
             embedding = self.model(**processed)
         # Remove batch dimension: [1, tokens, dim] -> [tokens, dim]
         if embedding.dim() == 3:
             embedding = embedding.squeeze(0)
         if should_filter:
             # Filter special tokens based on attention mask
             attention_mask = processed.get("attention_mask")
                 # Keep only tokens with attention_mask = 1
                 valid_mask = attention_mask.squeeze(0).bool()
                 embedding = embedding[valid_mask]
                 # Additionally filter padding tokens if present
                 input_ids = processed.get("input_ids")
                 if input_ids is not None:
                     non_special_mask = input_ids >= 4
                     if non_special_mask.any():
                         embedding = embedding[non_special_mask]
             logger.debug(f"Query embedding: {embedding.shape[0]} tokens after filtering")
         else:
             logger.debug(f"Query embedding: {embedding.shape[0]} tokens (unfiltered)")
         return embedding
     def embed_queries(
         Returns a list of tensors, each of shape [num_tokens, embedding_dim].
         """
         should_filter = (
+            filter_special_tokens
+            if filter_special_tokens is not None
+            else self.filter_special_tokens
         )
         batch_size = batch_size or self.batch_size
                 torch.mps.empty_cache()
         return outputs
     def embed_images(
         self,
         images: List[Image.Image],
     ) -> Union[List[torch.Tensor], Tuple[List[torch.Tensor], List[Dict[str, Any]]]]:
         """
         Generate embeddings for a list of images.
         Args:
             images: List of PIL Images
             batch_size: Override instance batch size
             return_token_info: Also return token metadata (for saliency maps)
             show_progress: Show progress bar
         Returns:
             If return_token_info=False:
                 List of embedding tensors [num_patches, dim]
             If return_token_info=True:
                 Tuple of (embeddings, token_infos)
         Token info contains:
             - visual_token_indices: Indices of visual tokens in embedding
             - num_visual_tokens: Count of visual tokens
             - num_tiles: Total tiles (n_rows × n_cols + 1 global)
         """
         batch_size = batch_size or self.batch_size
+        if (
+            self.device == "mps"
+            and "colpali" in (self.model_name or "").lower()
+            and int(batch_size) > 1
+        ):
             batch_size = 1
         embeddings = []
         token_infos = [] if return_token_info else None
         iterator = range(0, len(images), batch_size)
         if show_progress:
             iterator = tqdm(iterator, desc="🎨 Embedding", unit="batch")
         for i in iterator:
+            batch = images[i : i + batch_size]
             with torch.no_grad():
                 processed = self.processor.process_images(batch).to(self.model.device)
                 # Extract token info before model forward
                 if return_token_info:
                     input_ids = processed["input_ids"]
                     batch_n_rows = processed.get("n_rows")
                     batch_n_cols = processed.get("n_cols")
                     for j in range(input_ids.shape[0]):
                         # Find visual token indices
+                        image_token_mask = input_ids[j] == self.image_token_id
                         visual_indices = torch.where(image_token_mask)[0].cpu().numpy().tolist()
                         n_rows = batch_n_rows[j].item() if batch_n_rows is not None else None
                         n_cols = batch_n_cols[j].item() if batch_n_cols is not None else None
+                        token_infos.append(
+                            {
+                                "visual_token_indices": visual_indices,
+                                "num_visual_tokens": len(visual_indices),
+                                "n_rows": n_rows,
+                                "n_cols": n_cols,
+                                "num_tiles": (n_rows * n_cols + 1) if n_rows and n_cols else None,
+                            }
+                        )
                 # Generate embeddings
                 batch_embeddings = self.model(**processed)
             # Extract per-image embeddings
             if isinstance(batch_embeddings, torch.Tensor) and batch_embeddings.dim() == 3:
                 for j in range(batch_embeddings.shape[0]):
                     embeddings.append(batch_embeddings[j].cpu())
             else:
                 embeddings.extend([e.cpu() for e in batch_embeddings])
             # Memory cleanup
             del processed, batch_embeddings
             gc.collect()
                 torch.cuda.empty_cache()
             elif torch.backends.mps.is_available():
                 torch.mps.empty_cache()
         if return_token_info:
             return embeddings, token_infos
         return embeddings
     def extract_visual_embedding(
         self,
         full_embedding: torch.Tensor,
     ) -> np.ndarray:
         """
         Extract only visual token embeddings from full embedding.
         Filters out special tokens, keeping only visual patches for MaxSim.
         Args:
             full_embedding: Full embedding [all_tokens, dim]
             token_info: Token info dict from embed_images
         Returns:
             Visual embedding array [num_visual_tokens, dim]
         """
         visual_indices = token_info["visual_token_indices"]
         if isinstance(full_embedding, torch.Tensor):
             if full_embedding.dtype == torch.bfloat16:
                 visual_emb = full_embedding[visual_indices].cpu().float().numpy()
                 visual_emb = full_embedding[visual_indices].cpu().numpy()
         else:
             visual_emb = np.array(full_embedding)[visual_indices]
         return visual_emb.astype(self.output_dtype)
     def mean_pool_visual_embedding(
             n_rows = (token_info or {}).get("n_rows")
             n_cols = (token_info or {}).get("n_cols")
             num_tiles = int(n_rows) * int(n_cols) + 1 if n_rows and n_cols else 13
+            return tile_level_mean_pooling(
+                visual_np, num_tiles=num_tiles, patches_per_tile=64, output_dtype=self.output_dtype
+            )
         num_tokens = int(visual_np.shape[0])
         grid = int(round(float(num_tokens) ** 0.5))
         if grid * grid != num_tokens:
+            raise ValueError(
+                f"Cannot infer square grid from num_visual_tokens={num_tokens} for model={self.model_name}"
+            )
         if int(target_vectors) != int(grid):
             raise ValueError(
                 f"target_vectors={target_vectors} does not match inferred grid_size={grid} for model={self.model_name}"
             )
+        return colpali_row_mean_pooling(
+            visual_np, grid_size=int(target_vectors), output_dtype=self.output_dtype
+        )
     def global_pool_from_mean_pool(self, mean_pool: np.ndarray) -> np.ndarray:
         if mean_pool.size == 0:
         target_vectors: int = 32,
         mean_pool: Optional[np.ndarray] = None,
     ) -> np.ndarray:
+        from visual_rag.embedding.pooling import (
+            colpali_experimental_pooling_from_rows,
+            colsmol_experimental_pooling,
+        )
         model_lower = (self.model_name or "").lower()
         is_colsmol = "colsmol" in model_lower
             visual_np = np.array(visual_embedding, dtype=np.float32)
         if is_colsmol:
+            if (
+                mean_pool is not None
+                and getattr(mean_pool, "shape", None) is not None
+                and int(mean_pool.shape[0]) > 0
+            ):
                 num_tiles = int(mean_pool.shape[0])
             else:
                 num_tiles = (token_info or {}).get("num_tiles")
                     if int(num_tiles) * patches_per_tile != int(num_visual_tokens):
                         num_tiles = int(num_tiles) + 1
                 num_tiles = int(num_tiles)
+            return colsmol_experimental_pooling(
+                visual_np, num_tiles=num_tiles, patches_per_tile=64, output_dtype=self.output_dtype
+            )
+        rows = (
+            mean_pool
+            if mean_pool is not None
+            else self.mean_pool_visual_embedding(
+                visual_np, token_info, target_vectors=target_vectors
+            )
+        )
         if int(rows.shape[0]) != int(target_vectors):
             raise ValueError(
                 f"experimental pooling expects mean_pool to have {target_vectors} rows, got {rows.shape[0]} for model={self.model_name}"
             )
         return colpali_experimental_pooling_from_rows(rows, output_dtype=self.output_dtype)
 # Backward compatibility alias
 ColPaliEmbedder = VisualEmbedder

visual_rag/indexing/__init__.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""
+Indexing module - PDF processing, embedding storage, and CDN uploads.
+Components:
+- PDFProcessor: Convert PDFs to images and extract text
+- QdrantIndexer: Upload embeddings to Qdrant vector database
+- CloudinaryUploader: Upload images to Cloudinary CDN
+- ProcessingPipeline: End-to-end PDF → Qdrant pipeline
+"""
+# Guarded imports: a component whose optional dependency is missing is bound to None instead of failing the package import
+try:
+    from visual_rag.indexing.pdf_processor import PDFProcessor
+except ImportError:
+    PDFProcessor = None
+try:
+    from visual_rag.indexing.qdrant_indexer import QdrantIndexer
+except ImportError:
+    QdrantIndexer = None
+try:
+    from visual_rag.indexing.cloudinary_uploader import CloudinaryUploader
+except ImportError:
+    CloudinaryUploader = None
+try:
+    from visual_rag.indexing.pipeline import ProcessingPipeline
+except ImportError:
+    ProcessingPipeline = None
+__all__ = [
+    "PDFProcessor",
+    "QdrantIndexer",
+    "CloudinaryUploader",
+    "ProcessingPipeline",
+]

visual_rag/indexing/cloudinary_uploader.py CHANGED Viewed

@@ -15,14 +15,15 @@ Environment Variables:
 """
 import io
-import os
-import time
-import signal
 import logging
 import platform
 import threading
 from typing import Optional
-from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
 from PIL import Image
@@ -34,9 +35,9 @@ THREAD_SAFE_MODE = os.getenv("VISUAL_RAG_THREAD_SAFE", "").lower() in ("1", "tru
 class CloudinaryUploader:
     """
     Upload images to Cloudinary CDN.
     Works independently - just needs PIL images.
     Args:
         cloud_name: Cloudinary cloud name
         api_key: Cloudinary API key
@@ -44,7 +45,7 @@ class CloudinaryUploader:
         folder: Base folder for uploads
         max_retries: Number of retry attempts
         timeout_seconds: Timeout per upload
     Example:
         >>> uploader = CloudinaryUploader(
         ...     cloud_name="my-cloud",
@@ -52,11 +53,11 @@ class CloudinaryUploader:
         ...     api_secret="yyy",
         ...     folder="my-project",
         ... )
-        >>>
         >>> url = uploader.upload(image, "doc_page_1")
         >>> print(url)  # https://res.cloudinary.com/.../doc_page_1.jpg
     """
     def __init__(
         self,
         cloud_name: Optional[str] = None,
@@ -71,19 +72,19 @@ class CloudinaryUploader:
         self.cloud_name = cloud_name or os.getenv("CLOUDINARY_CLOUD_NAME")
         self.api_key = api_key or os.getenv("CLOUDINARY_API_KEY")
         self.api_secret = api_secret or os.getenv("CLOUDINARY_API_SECRET")
         if not all([self.cloud_name, self.api_key, self.api_secret]):
             raise ValueError(
                 "Cloudinary credentials required. Set CLOUDINARY_CLOUD_NAME, "
                 "CLOUDINARY_API_KEY, CLOUDINARY_API_SECRET environment variables "
                 "or pass them as arguments."
             )
         self.folder = folder
         self.max_retries = max_retries
         self.timeout_seconds = timeout_seconds
         self.jpeg_quality = jpeg_quality
         # Check dependency
         try:
             import cloudinary  # noqa
@@ -92,10 +93,10 @@ class CloudinaryUploader:
                 "Cloudinary not installed. "
                 "Install with: pip install visual-rag-toolkit[cloudinary]"
             )
-        logger.info(f"☁️ Cloudinary uploader initialized")
         logger.info(f"   Folder: {folder}")
     def upload(
         self,
         image: Image.Image,
@@ -104,34 +105,34 @@ class CloudinaryUploader:
     ) -> Optional[str]:
         """
         Upload a single image to Cloudinary.
         Args:
             image: PIL Image to upload
             public_id: Public ID (filename without extension)
             subfolder: Optional subfolder within base folder
         Returns:
             Secure URL of uploaded image, or None if failed
         """
         import cloudinary
         import cloudinary.uploader
         # Prepare buffer
         buffer = io.BytesIO()
         image.save(buffer, format="JPEG", quality=self.jpeg_quality, optimize=True)
         # Configure Cloudinary
         cloudinary.config(
             cloud_name=self.cloud_name,
             api_key=self.api_key,
             api_secret=self.api_secret,
         )
         # Build folder path
         folder_path = self.folder
         if subfolder:
             folder_path = f"{self.folder}/{subfolder}"
         def do_upload():
             buffer.seek(0)
             result = cloudinary.uploader.upload(
@@ -143,14 +144,14 @@ class CloudinaryUploader:
                 timeout=self.timeout_seconds,
             )
             return result["secure_url"]
         # Use thread-safe mode for Streamlit/Flask/threaded contexts
         # Set VISUAL_RAG_THREAD_SAFE=1 to enable
         if THREAD_SAFE_MODE or threading.current_thread() is not threading.main_thread():
             return self._upload_with_thread_timeout(do_upload, public_id)
         else:
             return self._upload_with_signal_timeout(do_upload, public_id)
     def _upload_with_thread_timeout(self, do_upload, public_id: str) -> Optional[str]:
         """Thread-safe upload with ThreadPoolExecutor timeout."""
         for attempt in range(self.max_retries):
@@ -158,64 +159,60 @@ class CloudinaryUploader:
                 with ThreadPoolExecutor(max_workers=1) as executor:
                     future = executor.submit(do_upload)
                     return future.result(timeout=self.timeout_seconds)
             except FuturesTimeoutError:
                 logger.warning(
                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                 )
                 if attempt < self.max_retries - 1:
-                    time.sleep(2 ** attempt)
             except Exception as e:
-                logger.warning(
-                    f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}"
-                )
                 if attempt < self.max_retries - 1:
-                    time.sleep(2 ** attempt)
         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
         return None
     def _upload_with_signal_timeout(self, do_upload, public_id: str) -> Optional[str]:
         """Signal-based upload timeout (main thread only, Unix/macOS)."""
         use_timeout = platform.system() != "Windows"
         class SignalTimeoutError(Exception):
             pass
         def timeout_handler(signum, frame):
             raise SignalTimeoutError(f"Upload timed out after {self.timeout_seconds}s")
         for attempt in range(self.max_retries):
             try:
                 if use_timeout:
                     old_handler = signal.signal(signal.SIGALRM, timeout_handler)
                     signal.alarm(self.timeout_seconds)
                 try:
                     return do_upload()
                 finally:
                     if use_timeout:
                         signal.alarm(0)
                         signal.signal(signal.SIGALRM, old_handler)
             except SignalTimeoutError:
                 logger.warning(
                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                 )
                 if attempt < self.max_retries - 1:
-                    time.sleep(2 ** attempt)
             except Exception as e:
-                logger.warning(
-                    f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}"
-                )
                 if attempt < self.max_retries - 1:
-                    time.sleep(2 ** attempt)
         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
         return None
     def upload_original_and_resized(
         self,
         original_image: Image.Image,
@@ -224,12 +221,12 @@ class CloudinaryUploader:
     ) -> tuple:
         """
         Upload both original and resized versions.
         Args:
             original_image: Original PDF page image
             resized_image: Resized image for ColPali
             base_public_id: Base public ID (e.g., "doc_page_1")
         Returns:
             Tuple of (original_url, resized_url) - either can be None on failure
         """
@@ -238,13 +235,13 @@ class CloudinaryUploader:
             base_public_id,
             subfolder="original",
         )
         resized_url = self.upload(
             resized_image,
             base_public_id,
             subfolder="resized",
         )
         return original_url, resized_url
     def upload_original_cropped_and_resized(
@@ -275,5 +272,3 @@ class CloudinaryUploader:
         )
         return original_url, cropped_url, resized_url

 """
 import io
 import logging
+import os
 import platform
+import signal
 import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import TimeoutError as FuturesTimeoutError
 from typing import Optional
 from PIL import Image
 class CloudinaryUploader:
     """
     Upload images to Cloudinary CDN.
     Works independently - just needs PIL images.
     Args:
         cloud_name: Cloudinary cloud name
         api_key: Cloudinary API key
         folder: Base folder for uploads
         max_retries: Number of retry attempts
         timeout_seconds: Timeout per upload
     Example:
         >>> uploader = CloudinaryUploader(
         ...     cloud_name="my-cloud",
         ...     api_secret="yyy",
         ...     folder="my-project",
         ... )
+        >>>
         >>> url = uploader.upload(image, "doc_page_1")
         >>> print(url)  # https://res.cloudinary.com/.../doc_page_1.jpg
     """
     def __init__(
         self,
         cloud_name: Optional[str] = None,
         self.cloud_name = cloud_name or os.getenv("CLOUDINARY_CLOUD_NAME")
         self.api_key = api_key or os.getenv("CLOUDINARY_API_KEY")
         self.api_secret = api_secret or os.getenv("CLOUDINARY_API_SECRET")
         if not all([self.cloud_name, self.api_key, self.api_secret]):
             raise ValueError(
                 "Cloudinary credentials required. Set CLOUDINARY_CLOUD_NAME, "
                 "CLOUDINARY_API_KEY, CLOUDINARY_API_SECRET environment variables "
                 "or pass them as arguments."
             )
         self.folder = folder
         self.max_retries = max_retries
         self.timeout_seconds = timeout_seconds
         self.jpeg_quality = jpeg_quality
         # Check dependency
         try:
             import cloudinary  # noqa
                 "Cloudinary not installed. "
                 "Install with: pip install visual-rag-toolkit[cloudinary]"
             )
+        logger.info("☁️ Cloudinary uploader initialized")
         logger.info(f"   Folder: {folder}")
     def upload(
         self,
         image: Image.Image,
     ) -> Optional[str]:
         """
         Upload a single image to Cloudinary.
         Args:
             image: PIL Image to upload
             public_id: Public ID (filename without extension)
             subfolder: Optional subfolder within base folder
         Returns:
             Secure URL of uploaded image, or None if failed
         """
         import cloudinary
         import cloudinary.uploader
         # Prepare buffer
         buffer = io.BytesIO()
         image.save(buffer, format="JPEG", quality=self.jpeg_quality, optimize=True)
         # Configure Cloudinary
         cloudinary.config(
             cloud_name=self.cloud_name,
             api_key=self.api_key,
             api_secret=self.api_secret,
         )
         # Build folder path
         folder_path = self.folder
         if subfolder:
             folder_path = f"{self.folder}/{subfolder}"
         def do_upload():
             buffer.seek(0)
             result = cloudinary.uploader.upload(
                 timeout=self.timeout_seconds,
             )
             return result["secure_url"]
         # Use thread-safe mode for Streamlit/Flask/threaded contexts
         # Set VISUAL_RAG_THREAD_SAFE=1 to enable
         if THREAD_SAFE_MODE or threading.current_thread() is not threading.main_thread():
             return self._upload_with_thread_timeout(do_upload, public_id)
         else:
             return self._upload_with_signal_timeout(do_upload, public_id)
     def _upload_with_thread_timeout(self, do_upload, public_id: str) -> Optional[str]:
         """Thread-safe upload with ThreadPoolExecutor timeout."""
         for attempt in range(self.max_retries):
                 with ThreadPoolExecutor(max_workers=1) as executor:
                     future = executor.submit(do_upload)
                     return future.result(timeout=self.timeout_seconds)
             except FuturesTimeoutError:
                 logger.warning(
                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                 )
                 if attempt < self.max_retries - 1:
+                    time.sleep(2**attempt)
             except Exception as e:
+                logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                 if attempt < self.max_retries - 1:
+                    time.sleep(2**attempt)
         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
         return None
     def _upload_with_signal_timeout(self, do_upload, public_id: str) -> Optional[str]:
         """Signal-based upload timeout (main thread only, Unix/macOS)."""
         use_timeout = platform.system() != "Windows"
         class SignalTimeoutError(Exception):
             pass
         def timeout_handler(signum, frame):
             raise SignalTimeoutError(f"Upload timed out after {self.timeout_seconds}s")
         for attempt in range(self.max_retries):
             try:
                 if use_timeout:
                     old_handler = signal.signal(signal.SIGALRM, timeout_handler)
                     signal.alarm(self.timeout_seconds)
                 try:
                     return do_upload()
                 finally:
                     if use_timeout:
                         signal.alarm(0)
                         signal.signal(signal.SIGALRM, old_handler)
             except SignalTimeoutError:
                 logger.warning(
                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
                 )
                 if attempt < self.max_retries - 1:
+                    time.sleep(2**attempt)
             except Exception as e:
+                logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                 if attempt < self.max_retries - 1:
+                    time.sleep(2**attempt)
         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
         return None
     def upload_original_and_resized(
         self,
         original_image: Image.Image,
     ) -> tuple:
         """
         Upload both original and resized versions.
         Args:
             original_image: Original PDF page image
             resized_image: Resized image for ColPali
             base_public_id: Base public ID (e.g., "doc_page_1")
         Returns:
             Tuple of (original_url, resized_url) - either can be None on failure
         """
             base_public_id,
             subfolder="original",
         )
         resized_url = self.upload(
             resized_image,
             base_public_id,
             subfolder="resized",
         )
         return original_url, resized_url
     def upload_original_cropped_and_resized(
         )
         return original_url, cropped_url, resized_url

visual_rag/indexing/pdf_processor.py CHANGED Viewed

@@ -11,10 +11,10 @@ Features:
 """
 import gc
-import re
 import logging
 from pathlib import Path
-from typing import List, Dict, Any, Optional, Tuple, Generator
 from PIL import Image
@@ -24,26 +24,26 @@ logger = logging.getLogger(__name__)
 class PDFProcessor:
     """
     Process PDFs into images and text for visual retrieval.
     Works independently - no embedding or storage dependencies.
     Args:
         dpi: DPI for image conversion (higher = better quality)
         output_format: Image format (RGB, L, etc.)
         page_batch_size: Pages per batch for memory efficiency
     Example:
         >>> processor = PDFProcessor(dpi=140)
-        >>>
         >>> # Convert single PDF
         >>> images, texts = processor.process_pdf(Path("report.pdf"))
-        >>>
         >>> # Stream large PDFs
         >>> for images, texts in processor.stream_pdf(Path("large.pdf"), batch_size=10):
         ...     # Process each batch
         ...     pass
     """
     def __init__(
         self,
         dpi: int = 140,
@@ -53,17 +53,24 @@ class PDFProcessor:
         self.dpi = dpi
         self.output_format = output_format
         self.page_batch_size = page_batch_size
-        # Check dependencies
         try:
-            from pdf2image import convert_from_path  # noqa
-            from pypdf import PdfReader  # noqa
-        except ImportError:
             raise ImportError(
-                "PDF processing requires pdf2image and pypdf. "
-                "Install with: pip install visual-rag-toolkit[pdf]"
             )
     def process_pdf(
         self,
         pdf_path: Path,
@@ -71,38 +78,39 @@ class PDFProcessor:
     ) -> Tuple[List[Image.Image], List[str]]:
         """
         Convert PDF to images and extract text.
         Args:
             pdf_path: Path to PDF file
             dpi: Override default DPI
         Returns:
             Tuple of (list of images, list of page texts)
         """
         from pdf2image import convert_from_path
         from pypdf import PdfReader
         dpi = dpi or self.dpi
         pdf_path = Path(pdf_path)
         logger.info(f"📄 Processing PDF: {pdf_path.name}")
         # Extract text
         reader = PdfReader(str(pdf_path))
         total_pages = len(reader.pages)
         page_texts = []
         for page in reader.pages:
             text = page.extract_text() or ""
             # Handle surrogate characters
             text = self._sanitize_text(text)
             page_texts.append(text)
         # Convert to images in batches
         all_images = []
         for start_page in range(1, total_pages + 1, self.page_batch_size):
             end_page = min(start_page + self.page_batch_size - 1, total_pages)
             batch_images = convert_from_path(
                 str(pdf_path),
                 dpi=dpi,
@@ -110,19 +118,19 @@ class PDFProcessor:
                 first_page=start_page,
                 last_page=end_page,
             )
             all_images.extend(batch_images)
             del batch_images
             gc.collect()
-        assert len(all_images) == len(page_texts), (
-            f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"
-        )
         logger.info(f"✅ Processed {len(all_images)} pages")
         return all_images, page_texts
     def stream_pdf(
         self,
         pdf_path: Path,
@@ -131,39 +139,40 @@ class PDFProcessor:
     ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]:
         """
         Stream PDF processing for large files.
         Yields batches of (images, texts, start_page) without loading
         entire PDF into memory.
         Args:
             pdf_path: Path to PDF file
             batch_size: Pages per batch
             dpi: Override default DPI
         Yields:
             Tuple of (batch_images, batch_texts, start_page_number)
         """
         from pdf2image import convert_from_path
         from pypdf import PdfReader
         dpi = dpi or self.dpi
         pdf_path = Path(pdf_path)
         reader = PdfReader(str(pdf_path))
         total_pages = len(reader.pages)
         logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")
         for start_idx in range(0, total_pages, batch_size):
             end_idx = min(start_idx + batch_size, total_pages)
             # Extract text for batch
             batch_texts = []
             for page_idx in range(start_idx, end_idx):
                 text = reader.pages[page_idx].extract_text() or ""
                 text = self._sanitize_text(text)
                 batch_texts.append(text)
             # Convert images for batch
             batch_images = convert_from_path(
                 str(pdf_path),
@@ -172,18 +181,20 @@ class PDFProcessor:
                 first_page=start_idx + 1,  # 1-indexed
                 last_page=end_idx,
             )
             yield batch_images, batch_texts, start_idx + 1
             del batch_images
             gc.collect()
     def get_page_count(self, pdf_path: Path) -> int:
         """
         Get number of pages in PDF without loading images.

         Args:
             pdf_path: Path to PDF file

         Returns:
             Number of pages reported by pypdf's ``PdfReader``
         """
         # Imported locally so pypdf is only needed when this method is called.
         from pypdf import PdfReader
         # PdfReader is given the path as a plain string.
         reader = PdfReader(str(pdf_path))
         return len(reader.pages)
     def resize_for_colpali(
         self,
         image: Image.Image,
@@ -192,19 +203,23 @@ class PDFProcessor:
     ) -> Tuple[Image.Image, int, int]:
         """
         Resize image following ColPali/Idefics3 processor logic.
         Resizes to fit within tile grid without black padding.
         Args:
             image: PIL Image
             max_edge: Maximum edge length
             tile_size: Size of each tile
         Returns:
             Tuple of (resized_image, tile_rows, tile_cols)
         """
         w, h = image.size
         # Step 1: Resize so longest edge = max_edge
         if w > h:
             new_w = max_edge
@@ -212,25 +227,25 @@ class PDFProcessor:
         else:
             new_h = max_edge
             new_w = int(w * (max_edge / h))
         # Step 2: Calculate tile grid
         tile_cols = (new_w + tile_size - 1) // tile_size
         tile_rows = (new_h + tile_size - 1) // tile_size
         # Step 3: Calculate exact dimensions for tiles
         final_w = tile_cols * tile_size
         final_h = tile_rows * tile_size
         # Step 4: Scale to fit within tile grid
         scale_w = final_w / w
         scale_h = final_h / h
         scale = min(scale_w, scale_h)
         scaled_w = int(w * scale)
         scaled_h = int(h * scale)
         resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)
         # Center on white canvas if needed
         if scaled_w != final_w or scaled_h != final_h:
             canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
@@ -238,19 +253,17 @@ class PDFProcessor:
             offset_y = (final_h - scaled_h) // 2
             canvas.paste(resized, (offset_x, offset_y))
             resized = canvas
         return resized, tile_rows, tile_cols
     def _sanitize_text(self, text: str) -> str:
         """Remove invalid Unicode characters (surrogates) from text."""
         if not text:
             return ""
         # Remove surrogate characters (U+D800-U+DFFF)
-        return text.encode("utf-8", errors="surrogatepass").decode(
-            "utf-8", errors="ignore"
-        )
     def extract_metadata_from_filename(
         self,
         filename: str,
@@ -258,47 +271,45 @@ class PDFProcessor:
     ) -> Dict[str, Any]:
         """
         Extract metadata from PDF filename.
         Uses mapping if provided, otherwise falls back to pattern matching.
         Args:
             filename: PDF filename (with or without .pdf extension)
             mapping: Optional mapping dict {filename: metadata}
         Returns:
             Metadata dict with year, source, district, etc.
         """
         # Remove extension
         stem = Path(filename).stem
         stem_lower = stem.lower().strip()
         # Try mapping first
         if mapping:
             if stem_lower in mapping:
                 return mapping[stem_lower].copy()
             # Try without .pdf
             stem_no_ext = stem_lower.replace(".pdf", "")
             if stem_no_ext in mapping:
                 return mapping[stem_no_ext].copy()
         # Fallback: pattern matching
         metadata = {"filename": filename}
         # Extract year
         year_match = re.search(r"(20\d{2})", stem)
         if year_match:
             metadata["year"] = int(year_match.group(1))
         # Detect source type
         if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
             metadata["source"] = "Consolidated"
         elif "dlg" in stem_lower or "district local government" in stem_lower:
             metadata["source"] = "Local Government"
             # Try to extract district name
-            district_match = re.search(
-                r"([a-z]+)\s+(?:dlg|district local government)", stem_lower
-            )
             if district_match:
                 metadata["district"] = district_match.group(1).title()
         elif "hospital" in stem_lower or "referral" in stem_lower:
@@ -309,7 +320,5 @@ class PDFProcessor:
             metadata["source"] = "Project"
         else:
             metadata["source"] = "Unknown"
-        return metadata

 """
 import gc
 import logging
+import re
 from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional, Tuple
 from PIL import Image
 class PDFProcessor:
     """
     Process PDFs into images and text for visual retrieval.
     Works independently - no embedding or storage dependencies.
     Args:
         dpi: DPI for image conversion (higher = better quality)
         output_format: Image format (RGB, L, etc.)
         page_batch_size: Pages per batch for memory efficiency
     Example:
         >>> processor = PDFProcessor(dpi=140)
+        >>>
         >>> # Convert single PDF
         >>> images, texts = processor.process_pdf(Path("report.pdf"))
+        >>>
         >>> # Stream large PDFs
         >>> for images, texts in processor.stream_pdf(Path("large.pdf"), batch_size=10):
         ...     # Process each batch
         ...     pass
     """
     def __init__(
         self,
         dpi: int = 140,
         self.dpi = dpi
         self.output_format = output_format
         self.page_batch_size = page_batch_size
+        # PDF deps are optional: we only require them when calling PDF-specific methods.
+        # This keeps the class usable for helper utilities like `resize_for_colpali()`
+        # even in minimal installs.
+        self._pdf_deps_available = True
         try:
+            import pdf2image  # noqa: F401
+            import pypdf  # noqa: F401
+        except Exception:
+            self._pdf_deps_available = False
+    def _require_pdf_deps(self) -> None:
+        if not self._pdf_deps_available:
             raise ImportError(
+                "PDF processing requires `pdf2image` and `pypdf`.\n"
+                'Install with: pip install "visual-rag-toolkit[pdf]"'
             )
     def process_pdf(
         self,
         pdf_path: Path,
     ) -> Tuple[List[Image.Image], List[str]]:
         """
         Convert PDF to images and extract text.
         Args:
             pdf_path: Path to PDF file
             dpi: Override default DPI
         Returns:
             Tuple of (list of images, list of page texts)
         """
+        self._require_pdf_deps()
         from pdf2image import convert_from_path
         from pypdf import PdfReader
         dpi = dpi or self.dpi
         pdf_path = Path(pdf_path)
         logger.info(f"📄 Processing PDF: {pdf_path.name}")
         # Extract text
         reader = PdfReader(str(pdf_path))
         total_pages = len(reader.pages)
         page_texts = []
         for page in reader.pages:
             text = page.extract_text() or ""
             # Handle surrogate characters
             text = self._sanitize_text(text)
             page_texts.append(text)
         # Convert to images in batches
         all_images = []
         for start_page in range(1, total_pages + 1, self.page_batch_size):
             end_page = min(start_page + self.page_batch_size - 1, total_pages)
             batch_images = convert_from_path(
                 str(pdf_path),
                 dpi=dpi,
                 first_page=start_page,
                 last_page=end_page,
             )
             all_images.extend(batch_images)
             del batch_images
             gc.collect()
+        assert len(all_images) == len(
+            page_texts
+        ), f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"
         logger.info(f"✅ Processed {len(all_images)} pages")
         return all_images, page_texts
     def stream_pdf(
         self,
         pdf_path: Path,
     ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]:
         """
         Stream PDF processing for large files.
         Yields batches of (images, texts, start_page) without loading
         entire PDF into memory.
         Args:
             pdf_path: Path to PDF file
             batch_size: Pages per batch
             dpi: Override default DPI
         Yields:
             Tuple of (batch_images, batch_texts, start_page_number)
         """
+        self._require_pdf_deps()
         from pdf2image import convert_from_path
         from pypdf import PdfReader
         dpi = dpi or self.dpi
         pdf_path = Path(pdf_path)
         reader = PdfReader(str(pdf_path))
         total_pages = len(reader.pages)
         logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")
         for start_idx in range(0, total_pages, batch_size):
             end_idx = min(start_idx + batch_size, total_pages)
             # Extract text for batch
             batch_texts = []
             for page_idx in range(start_idx, end_idx):
                 text = reader.pages[page_idx].extract_text() or ""
                 text = self._sanitize_text(text)
                 batch_texts.append(text)
             # Convert images for batch
             batch_images = convert_from_path(
                 str(pdf_path),
                 first_page=start_idx + 1,  # 1-indexed
                 last_page=end_idx,
             )
             yield batch_images, batch_texts, start_idx + 1
             del batch_images
             gc.collect()
     def get_page_count(self, pdf_path: Path) -> int:
         """Get number of pages in PDF without loading images."""
+        self._require_pdf_deps()
         from pypdf import PdfReader
         reader = PdfReader(str(pdf_path))
         return len(reader.pages)
     def resize_for_colpali(
         self,
         image: Image.Image,
     ) -> Tuple[Image.Image, int, int]:
         """
         Resize image following ColPali/Idefics3 processor logic.
         Resizes to fit within tile grid without black padding.
         Args:
             image: PIL Image
             max_edge: Maximum edge length
             tile_size: Size of each tile
         Returns:
             Tuple of (resized_image, tile_rows, tile_cols)
         """
+        # Ensure consistent mode for downstream processors (and predictable tests)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
         w, h = image.size
         # Step 1: Resize so longest edge = max_edge
         if w > h:
             new_w = max_edge
         else:
             new_h = max_edge
             new_w = int(w * (max_edge / h))
         # Step 2: Calculate tile grid
         tile_cols = (new_w + tile_size - 1) // tile_size
         tile_rows = (new_h + tile_size - 1) // tile_size
         # Step 3: Calculate exact dimensions for tiles
         final_w = tile_cols * tile_size
         final_h = tile_rows * tile_size
         # Step 4: Scale to fit within tile grid
         scale_w = final_w / w
         scale_h = final_h / h
         scale = min(scale_w, scale_h)
         scaled_w = int(w * scale)
         scaled_h = int(h * scale)
         resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)
         # Center on white canvas if needed
         if scaled_w != final_w or scaled_h != final_h:
             canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
             offset_y = (final_h - scaled_h) // 2
             canvas.paste(resized, (offset_x, offset_y))
             resized = canvas
         return resized, tile_rows, tile_cols
     def _sanitize_text(self, text: str) -> str:
         """Remove invalid Unicode characters (surrogates) from text."""
         if not text:
             return ""
         # Remove surrogate characters (U+D800-U+DFFF)
+        return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
     def extract_metadata_from_filename(
         self,
         filename: str,
     ) -> Dict[str, Any]:
         """
         Extract metadata from PDF filename.
         Uses mapping if provided, otherwise falls back to pattern matching.
         Args:
             filename: PDF filename (with or without .pdf extension)
             mapping: Optional mapping dict {filename: metadata}
         Returns:
             Metadata dict with year, source, district, etc.
         """
         # Remove extension
         stem = Path(filename).stem
         stem_lower = stem.lower().strip()
         # Try mapping first
         if mapping:
             if stem_lower in mapping:
                 return mapping[stem_lower].copy()
             # Try without .pdf
             stem_no_ext = stem_lower.replace(".pdf", "")
             if stem_no_ext in mapping:
                 return mapping[stem_no_ext].copy()
         # Fallback: pattern matching
         metadata = {"filename": filename}
         # Extract year
         year_match = re.search(r"(20\d{2})", stem)
         if year_match:
             metadata["year"] = int(year_match.group(1))
         # Detect source type
         if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
             metadata["source"] = "Consolidated"
         elif "dlg" in stem_lower or "district local government" in stem_lower:
             metadata["source"] = "Local Government"
             # Try to extract district name
+            district_match = re.search(r"([a-z]+)\s+(?:dlg|district local government)", stem_lower)
             if district_match:
                 metadata["district"] = district_match.group(1).title()
         elif "hospital" in stem_lower or "referral" in stem_lower:
             metadata["source"] = "Project"
         else:
             metadata["source"] = "Unknown"
+        return metadata

visual_rag/indexing/pipeline.py CHANGED Viewed

@@ -16,11 +16,10 @@ The metadata stored includes everything needed for saliency visualization:
 """
 import gc
-import time
 import hashlib
 import logging
 from pathlib import Path
-from typing import Dict, Any, List, Optional, Set, Tuple
 import numpy as np
 import torch
@@ -31,7 +30,7 @@ logger = logging.getLogger(__name__)
 class ProcessingPipeline:
     """
     End-to-end pipeline for PDF processing and indexing.
     This pipeline:
     1. Converts PDFs to images
     2. Resizes for ColPali processing
@@ -39,7 +38,7 @@ class ProcessingPipeline:
     4. Computes pooling (strategy-dependent)
     5. Uploads images to Cloudinary (optional)
     6. Stores in Qdrant with full saliency metadata
     Args:
         embedder: VisualEmbedder instance
         indexer: QdrantIndexer instance (optional)
@@ -52,34 +51,34 @@ class ProcessingPipeline:
               This is our NOVEL contribution - preserves spatial structure while reducing size.
             - "standard": Push ALL tokens as-is (including special tokens, padding)
               This is the baseline approach for comparison.
     Example:
         >>> from visual_rag import VisualEmbedder, QdrantIndexer, CloudinaryUploader
         >>> from visual_rag.indexing.pipeline import ProcessingPipeline
-        >>>
         >>> # Our novel pooling strategy (default)
         >>> pipeline = ProcessingPipeline(
         ...     embedder=VisualEmbedder(),
         ...     indexer=QdrantIndexer(url, api_key, "my_collection"),
         ...     embedding_strategy="pooling",  # Visual tokens only + tile pooling
         ... )
-        >>>
         >>> # Standard baseline (all tokens, no filtering)
         >>> pipeline_baseline = ProcessingPipeline(
         ...     embedder=VisualEmbedder(),
         ...     indexer=QdrantIndexer(url, api_key, "my_collection_baseline"),
         ...     embedding_strategy="standard",  # All tokens as-is
         ... )
-        >>>
         >>> pipeline.process_pdf(Path("report.pdf"))
     """
     # Valid embedding strategies
     # - "pooling": Visual tokens only + tile-level pooling (NOVEL)
     # - "standard": All tokens + global mean (BASELINE)
     # - "all": Embed once, push BOTH representations (efficient comparison)
     STRATEGIES = ["pooling", "standard", "all"]
     def __init__(
         self,
         embedder=None,
@@ -92,13 +91,15 @@ class ProcessingPipeline:
         crop_empty: bool = False,
         crop_empty_percentage_to_remove: float = 0.9,
         crop_empty_remove_page_number: bool = False,
     ):
         self.embedder = embedder
         self.indexer = indexer
         self.cloudinary_uploader = cloudinary_uploader
         self.metadata_mapping = metadata_mapping or {}
         self.config = config or {}
         # Validate and set embedding strategy
         if embedding_strategy not in self.STRATEGIES:
             raise ValueError(
@@ -110,41 +111,50 @@ class ProcessingPipeline:
         self.crop_empty = bool(crop_empty)
         self.crop_empty_percentage_to_remove = float(crop_empty_percentage_to_remove)
         self.crop_empty_remove_page_number = bool(crop_empty_remove_page_number)
         logger.info(f"📊 Embedding strategy: {embedding_strategy}")
         if embedding_strategy == "pooling":
             logger.info("   → Visual tokens only + tile-level mean pooling (NOVEL)")
         else:
             logger.info("   → All tokens as-is (BASELINE)")
         # Create PDF processor if not provided
         if pdf_processor is None:
             from visual_rag.indexing.pdf_processor import PDFProcessor
             dpi = self.config.get("processing", {}).get("dpi", 140)
             pdf_processor = PDFProcessor(dpi=dpi)
         self.pdf_processor = pdf_processor
         # Config defaults
         self.embedding_batch_size = self.config.get("batching", {}).get("embedding_batch_size", 8)
         self.upload_batch_size = self.config.get("batching", {}).get("upload_batch_size", 8)
         self.delay_between_uploads = self.config.get("delays", {}).get("between_uploads", 0.5)
     def process_pdf(
         self,
         pdf_path: Path,
         skip_existing: bool = True,
         upload_to_cloudinary: bool = True,
         upload_to_qdrant: bool = True,
     ) -> Dict[str, Any]:
         """
         Process a single PDF end-to-end.
         Args:
             pdf_path: Path to PDF file
             skip_existing: Skip pages that already exist in Qdrant
             upload_to_cloudinary: Upload images to Cloudinary
             upload_to_qdrant: Upload embeddings to Qdrant
         Returns:
             Dict with processing results:
             {
@@ -157,62 +167,73 @@ class ProcessingPipeline:
             }
         """
         pdf_path = Path(pdf_path)
-        logger.info(f"📚 Processing PDF: {pdf_path.name}")
         # Check existing pages
         existing_ids: Set[str] = set()
         if skip_existing and self.indexer:
-            existing_ids = self.indexer.get_existing_ids(pdf_path.name)
             if existing_ids:
                 logger.info(f"   Found {len(existing_ids)} existing pages")
-        # Convert PDF to images
-        logger.info(f"🖼️ Converting PDF to images...")
         images, texts = self.pdf_processor.process_pdf(pdf_path)
         total_pages = len(images)
         logger.info(f"   ✅ Converted {total_pages} pages")
-        # Get extra metadata
-        extra_metadata = self._get_extra_metadata(pdf_path.name)
         if extra_metadata:
             logger.info(f"   📋 Found extra metadata: {list(extra_metadata.keys())}")
         # Process in batches
         uploaded = 0
         skipped = 0
         failed = 0
         all_pages = []
         upload_queue = []
         for batch_start in range(0, total_pages, self.embedding_batch_size):
             batch_end = min(batch_start + self.embedding_batch_size, total_pages)
             batch_images = images[batch_start:batch_end]
             batch_texts = texts[batch_start:batch_end]
             logger.info(f"📦 Processing pages {batch_start + 1}-{batch_end}/{total_pages}")
-            # Filter pages that need processing
             pages_to_process = []
             for i, (img, text) in enumerate(zip(batch_images, batch_texts)):
                 page_num = batch_start + i + 1
-                chunk_id = self.generate_chunk_id(pdf_path.name, page_num)
                 if skip_existing and chunk_id in existing_ids:
                     skipped += 1
                     continue
-                pages_to_process.append({
-                    "index": i,
-                    "page_num": page_num,
-                    "chunk_id": chunk_id,
-                    "raw_image": img,
-                    "text": text,
-                })
             if not pages_to_process:
                 logger.info("   All pages in batch exist, skipping...")
                 continue
             # Generate embeddings with token info
             logger.info(f"🤖 Generating embeddings for {len(pages_to_process)} pages...")
             from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty
@@ -226,6 +247,10 @@ class ProcessingPipeline:
                         config=CropEmptyConfig(
                             percentage_to_remove=float(self.crop_empty_percentage_to_remove),
                             remove_page_number=bool(self.crop_empty_remove_page_number),
                         ),
                     )
                     p["embed_image"] = cropped_img
@@ -235,15 +260,14 @@ class ProcessingPipeline:
                     p["embed_image"] = raw_img
                     p["crop_meta"] = None
                     images_to_embed.append(raw_img)
             embeddings, token_infos = self.embedder.embed_images(
                 images_to_embed,
                 batch_size=self.embedding_batch_size,
                 return_token_info=True,
-                show_progress=True,
             )
-            # Process each page
             for idx, page_info in enumerate(pages_to_process):
                 raw_img = page_info["raw_image"]
                 embed_img = page_info["embed_image"]
@@ -253,10 +277,19 @@ class ProcessingPipeline:
                 text = page_info["text"]
                 embedding = embeddings[idx]
                 token_info = token_infos[idx]
                 try:
                     page_data = self._process_single_page(
-                        pdf_path=pdf_path,
                         page_num=page_num,
                         chunk_id=chunk_id,
                         total_pages=total_pages,
@@ -269,46 +302,49 @@ class ProcessingPipeline:
                         upload_to_cloudinary=upload_to_cloudinary,
                         crop_meta=crop_meta,
                     )
                     all_pages.append(page_data)
                     if upload_to_qdrant and self.indexer:
                         upload_queue.append(page_data)
                         # Upload in batches
                         if len(upload_queue) >= self.upload_batch_size:
                             count = self._upload_batch(upload_queue)
                             uploaded += count
                             upload_queue = []
                 except Exception as e:
                     logger.error(f"   ❌ Failed page {page_num}: {e}")
                     failed += 1
             # Memory cleanup
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
         # Upload remaining pages
         if upload_queue and upload_to_qdrant and self.indexer:
             count = self._upload_batch(upload_queue)
             uploaded += count
-        logger.info(f"✅ Completed {pdf_path.name}: {uploaded} uploaded, {skipped} skipped, {failed} failed")
         return {
-            "filename": pdf_path.name,
             "total_pages": total_pages,
             "uploaded": uploaded,
             "skipped": skipped,
             "failed": failed,
             "pages": all_pages,
         }
     def _process_single_page(
         self,
-        pdf_path: Path,
         page_num: int,
         chunk_id: str,
         total_pages: int,
@@ -323,17 +359,17 @@ class ProcessingPipeline:
     ) -> Dict[str, Any]:
         """Process a single page with full metadata for saliency."""
         from visual_rag.embedding.pooling import global_mean_pooling
         # Resize image for ColPali
         resized_img, tile_rows, tile_cols = self.pdf_processor.resize_for_colpali(embed_img)
         # Use processor's tile info if available (more accurate)
         proc_n_rows = token_info.get("n_rows")
         proc_n_cols = token_info.get("n_cols")
         if proc_n_rows and proc_n_cols:
             tile_rows = proc_n_rows
             tile_cols = proc_n_cols
         # Convert embedding to numpy
         if isinstance(embedding, torch.Tensor):
             if embedding.dtype == torch.bfloat16:
@@ -343,24 +379,30 @@ class ProcessingPipeline:
         else:
             full_embedding = np.array(embedding)
         full_embedding = full_embedding.astype(np.float32)
         # Token info for metadata
         visual_indices = token_info["visual_token_indices"]
         num_visual_tokens = token_info["num_visual_tokens"]
         # =========================================================================
         # STRATEGY: "pooling" (NOVEL) vs "standard" (BASELINE) vs "all" (BOTH)
         # =========================================================================
         # Always compute visual-only embedding (needed for pooling and saliency)
         visual_embedding = full_embedding[visual_indices]
-        tile_pooled = self.embedder.mean_pool_visual_embedding(visual_embedding, token_info, target_vectors=32)
         experimental_pooled = self.embedder.experimental_pool_visual_embedding(
             visual_embedding, token_info, target_vectors=32, mean_pool=tile_pooled
         )
         global_pooled = global_mean_pooling(full_embedding)
-        global_pooling = self.embedder.global_pool_from_mean_pool(tile_pooled) if tile_pooled.size else global_pooled
         num_tiles = int(tile_pooled.shape[0])
         patches_per_tile = int(visual_embedding.shape[0] // max(num_tiles, 1)) if num_tiles else 0
@@ -369,64 +411,70 @@ class ProcessingPipeline:
         else:
             tile_rows = token_info.get("n_rows") or None
             tile_cols = token_info.get("n_cols") or None
         if self.embedding_strategy == "pooling":
             # NOVEL APPROACH: Visual tokens only + tile-level pooling
             embedding_for_initial = visual_embedding
             embedding_for_pooling = tile_pooled
-            global_pooling = self.embedder.global_pool_from_mean_pool(tile_pooled) if tile_pooled.size else global_pooled
         elif self.embedding_strategy == "standard":
             # BASELINE: All tokens + global mean
             embedding_for_initial = full_embedding
             embedding_for_pooling = global_pooled.reshape(1, -1)
             global_pooling = global_pooled
         else:  # "all" - Push BOTH representations (efficient for comparison)
             # Embed once, store multiple vector representations
             # This allows comparing both strategies without re-embedding
             embedding_for_initial = visual_embedding  # Use visual for search
-            embedding_for_pooling = tile_pooled       # Use tile-level for fast prefetch
-            global_pooling = self.embedder.global_pool_from_mean_pool(tile_pooled) if tile_pooled.size else global_pooled
             # ALSO store standard representations as additional vectors
             # These will be added to metadata for optional use
             pass  # Extra vectors handled in return dict below
         # Upload to Cloudinary
         original_url = None
         cropped_url = None
         resized_url = None
         if upload_to_cloudinary and self.cloudinary_uploader:
-            base_filename = f"{pdf_path.stem}_page_{page_num}"
             if self.crop_empty:
-                original_url, cropped_url, resized_url = self.cloudinary_uploader.upload_original_cropped_and_resized(
-                    raw_img, embed_img, resized_img, base_filename
                 )
             else:
                 original_url, resized_url = self.cloudinary_uploader.upload_original_and_resized(
                     raw_img, resized_img, base_filename
                 )
         # Sanitize text
         safe_text = self._sanitize_text(text[:10000]) if text else ""
-        # Build metadata (everything needed for saliency)
         metadata = {
-            # Document info
-            "filename": pdf_path.name,
             "page_number": page_num,
             "total_pages": total_pages,
             "has_text": bool(text and text.strip()),
             "text": safe_text,
             # Image URLs
             "page": resized_url or "",  # For display
             "original_url": original_url or "",
             "cropped_url": cropped_url or "",
             "resized_url": resized_url or "",
             # Dimensions (needed for saliency overlay)
             "original_width": raw_img.width,
             "original_height": raw_img.height,
@@ -434,35 +482,33 @@ class ProcessingPipeline:
             "cropped_height": int(embed_img.height) if self.crop_empty else int(raw_img.height),
             "resized_width": resized_img.width,
             "resized_height": resized_img.height,
             # Tile structure (needed for saliency)
             "num_tiles": num_tiles,
             "tile_rows": tile_rows,
             "tile_cols": tile_cols,
             "patches_per_tile": patches_per_tile,
             # Token info (needed for saliency)
             "num_visual_tokens": num_visual_tokens,
             "visual_token_indices": visual_indices,
             "total_tokens": len(full_embedding),  # Total tokens in raw embedding
             # Strategy used (important for paper comparison)
             "embedding_strategy": self.embedding_strategy,
             "model_name": getattr(self.embedder, "model_name", None),
             "crop_empty_enabled": bool(self.crop_empty),
             "crop_empty_crop_box": (crop_meta or {}).get("crop_box"),
             "crop_empty_remove_page_number": bool(self.crop_empty_remove_page_number),
             "crop_empty_percentage_to_remove": float(self.crop_empty_percentage_to_remove),
             # Extra metadata (year, district, etc.)
             **extra_metadata,
         }
         result = {
             "id": chunk_id,
-            "visual_embedding": embedding_for_initial,    # "initial" vector in Qdrant
             "tile_pooled_embedding": embedding_for_pooling,  # "mean_pooling" vector in Qdrant
             "experimental_pooled_embedding": experimental_pooled,  # "experimental_pooling" vector in Qdrant
             "global_pooled_embedding": global_pooling,  # "global_pooling" vector in Qdrant
@@ -470,70 +516,70 @@ class ProcessingPipeline:
             "image": raw_img,
             "resized_image": resized_img,
         }
         # For "all" strategy, include BOTH representations for comparison
         if self.embedding_strategy == "all":
             result["extra_vectors"] = {
                 # Standard baseline vectors (for comparison)
-                "full_embedding": full_embedding,           # All tokens [total, 128]
-                "global_pooled": global_pooled,             # Global mean [128]
                 # Pooling vectors (already in main result)
-                "visual_embedding": visual_embedding,       # Visual only [visual, 128]
-                "tile_pooled": tile_pooled,                 # Tile-level [tiles, 128]
             }
         return result
     def _upload_batch(self, upload_queue: List[Dict[str, Any]]) -> int:
         """Upload batch to Qdrant."""
         if not upload_queue or not self.indexer:
             return 0
         logger.info(f"📤 Uploading batch of {len(upload_queue)} pages...")
         count = self.indexer.upload_batch(
             upload_queue,
             delay_between_batches=self.delay_between_uploads,
         )
         return count
     def _get_extra_metadata(self, filename: str) -> Dict[str, Any]:
         """Get extra metadata for a filename."""
         if not self.metadata_mapping:
             return {}
         # Normalize filename
         filename_clean = filename.replace(".pdf", "").replace(".PDF", "").strip().lower()
         # Try exact match
         if filename_clean in self.metadata_mapping:
             return self.metadata_mapping[filename_clean].copy()
         # Try fuzzy match
         from difflib import SequenceMatcher
         best_match = None
         best_score = 0.0
         for known_filename, metadata in self.metadata_mapping.items():
             score = SequenceMatcher(None, filename_clean, known_filename.lower()).ratio()
             if score > best_score and score > 0.75:
                 best_score = score
                 best_match = metadata
         if best_match:
             logger.debug(f"Fuzzy matched '{filename}' with score {best_score:.2f}")
             return best_match.copy()
         return {}
     def _sanitize_text(self, text: str) -> str:
         """Remove invalid Unicode characters."""
         if not text:
             return ""
         return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
     @staticmethod
     def generate_chunk_id(filename: str, page_number: int) -> str:
         """Generate deterministic chunk ID."""
@@ -541,12 +587,12 @@ class ProcessingPipeline:
         hash_obj = hashlib.sha256(content.encode())
         hex_str = hash_obj.hexdigest()[:32]
         return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"
     @staticmethod
     def load_metadata_mapping(json_path: Path) -> Dict[str, Dict[str, Any]]:
         """
         Load metadata mapping from JSON file.
         Expected format:
         {
             "filenames": {
@@ -554,7 +600,7 @@ class ProcessingPipeline:
                 ...
             }
         }
         Or simple format:
         {
             "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
@@ -562,22 +608,21 @@ class ProcessingPipeline:
         }
         """
         import json
         with open(json_path, "r") as f:
             data = json.load(f)
         # Check if nested under "filenames"
         if "filenames" in data and isinstance(data["filenames"], dict):
             mapping = data["filenames"]
         else:
             mapping = data
         # Normalize keys to lowercase
         normalized = {}
         for filename, metadata in mapping.items():
             key = filename.lower().strip().replace(".pdf", "")
             normalized[key] = metadata
         logger.info(f"📖 Loaded metadata for {len(normalized)} files")
         return normalized

 """
 import gc
 import hashlib
 import logging
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
 import numpy as np
 import torch
 class ProcessingPipeline:
     """
     End-to-end pipeline for PDF processing and indexing.
     This pipeline:
     1. Converts PDFs to images
     2. Resizes for ColPali processing
     4. Computes pooling (strategy-dependent)
     5. Uploads images to Cloudinary (optional)
     6. Stores in Qdrant with full saliency metadata
     Args:
         embedder: VisualEmbedder instance
         indexer: QdrantIndexer instance (optional)
               This is our NOVEL contribution - preserves spatial structure while reducing size.
             - "standard": Push ALL tokens as-is (including special tokens, padding)
               This is the baseline approach for comparison.
     Example:
         >>> from visual_rag import VisualEmbedder, QdrantIndexer, CloudinaryUploader
         >>> from visual_rag.indexing.pipeline import ProcessingPipeline
+        >>>
         >>> # Our novel pooling strategy (default)
         >>> pipeline = ProcessingPipeline(
         ...     embedder=VisualEmbedder(),
         ...     indexer=QdrantIndexer(url, api_key, "my_collection"),
         ...     embedding_strategy="pooling",  # Visual tokens only + tile pooling
         ... )
+        >>>
         >>> # Standard baseline (all tokens, no filtering)
         >>> pipeline_baseline = ProcessingPipeline(
         ...     embedder=VisualEmbedder(),
         ...     indexer=QdrantIndexer(url, api_key, "my_collection_baseline"),
         ...     embedding_strategy="standard",  # All tokens as-is
         ... )
+        >>>
         >>> pipeline.process_pdf(Path("report.pdf"))
     """
     # Valid embedding strategies
     # - "pooling": Visual tokens only + tile-level pooling (NOVEL)
     # - "standard": All tokens + global mean (BASELINE)
     # - "all": Embed once, push BOTH representations (efficient comparison)
     STRATEGIES = ["pooling", "standard", "all"]
     def __init__(
         self,
         embedder=None,
         crop_empty: bool = False,
         crop_empty_percentage_to_remove: float = 0.9,
         crop_empty_remove_page_number: bool = False,
+        crop_empty_preserve_border_px: int = 1,
+        crop_empty_uniform_rowcol_std_threshold: float = 0.0,
     ):
         self.embedder = embedder
         self.indexer = indexer
         self.cloudinary_uploader = cloudinary_uploader
         self.metadata_mapping = metadata_mapping or {}
         self.config = config or {}
         # Validate and set embedding strategy
         if embedding_strategy not in self.STRATEGIES:
             raise ValueError(
         self.crop_empty = bool(crop_empty)
         self.crop_empty_percentage_to_remove = float(crop_empty_percentage_to_remove)
         self.crop_empty_remove_page_number = bool(crop_empty_remove_page_number)
+        self.crop_empty_preserve_border_px = int(crop_empty_preserve_border_px)
+        self.crop_empty_uniform_rowcol_std_threshold = float(
+            crop_empty_uniform_rowcol_std_threshold
+        )
         logger.info(f"📊 Embedding strategy: {embedding_strategy}")
         if embedding_strategy == "pooling":
             logger.info("   → Visual tokens only + tile-level mean pooling (NOVEL)")
         else:
             logger.info("   → All tokens as-is (BASELINE)")
         # Create PDF processor if not provided
         if pdf_processor is None:
             from visual_rag.indexing.pdf_processor import PDFProcessor
             dpi = self.config.get("processing", {}).get("dpi", 140)
             pdf_processor = PDFProcessor(dpi=dpi)
         self.pdf_processor = pdf_processor
         # Config defaults
         self.embedding_batch_size = self.config.get("batching", {}).get("embedding_batch_size", 8)
         self.upload_batch_size = self.config.get("batching", {}).get("upload_batch_size", 8)
         self.delay_between_uploads = self.config.get("delays", {}).get("between_uploads", 0.5)
     def process_pdf(
         self,
         pdf_path: Path,
         skip_existing: bool = True,
         upload_to_cloudinary: bool = True,
         upload_to_qdrant: bool = True,
+        original_filename: Optional[str] = None,
+        progress_callback: Optional[callable] = None,
     ) -> Dict[str, Any]:
         """
         Process a single PDF end-to-end.
         Args:
             pdf_path: Path to PDF file
             skip_existing: Skip pages that already exist in Qdrant
             upload_to_cloudinary: Upload images to Cloudinary
             upload_to_qdrant: Upload embeddings to Qdrant
+            original_filename: Original filename (use this instead of pdf_path.name for temp files)
+            progress_callback: Optional callback(stage, current, total, message) for progress updates
         Returns:
             Dict with processing results:
             {
             }
         """
         pdf_path = Path(pdf_path)
+        filename = original_filename or pdf_path.name
+        logger.info(f"📚 Processing PDF: {filename}")
         # Check existing pages
         existing_ids: Set[str] = set()
         if skip_existing and self.indexer:
+            existing_ids = self.indexer.get_existing_ids(filename)
             if existing_ids:
                 logger.info(f"   Found {len(existing_ids)} existing pages")
+        logger.info("🖼️ Converting PDF to images...")
+        if progress_callback:
+            progress_callback("convert", 0, 0, "Converting PDF to images...")
         images, texts = self.pdf_processor.process_pdf(pdf_path)
         total_pages = len(images)
         logger.info(f"   ✅ Converted {total_pages} pages")
+        if progress_callback:
+            progress_callback("convert", total_pages, total_pages, f"Converted {total_pages} pages")
+        extra_metadata = self._get_extra_metadata(filename)
         if extra_metadata:
             logger.info(f"   📋 Found extra metadata: {list(extra_metadata.keys())}")
         # Process in batches
         uploaded = 0
         skipped = 0
         failed = 0
         all_pages = []
         upload_queue = []
         for batch_start in range(0, total_pages, self.embedding_batch_size):
             batch_end = min(batch_start + self.embedding_batch_size, total_pages)
             batch_images = images[batch_start:batch_end]
             batch_texts = texts[batch_start:batch_end]
             logger.info(f"📦 Processing pages {batch_start + 1}-{batch_end}/{total_pages}")
+            if progress_callback:
+                progress_callback(
+                    "embed",
+                    batch_start,
+                    total_pages,
+                    f"Embedding pages {batch_start + 1}-{batch_end}",
+                )
             pages_to_process = []
             for i, (img, text) in enumerate(zip(batch_images, batch_texts)):
                 page_num = batch_start + i + 1
+                chunk_id = self.generate_chunk_id(filename, page_num)
                 if skip_existing and chunk_id in existing_ids:
                     skipped += 1
                     continue
+                pages_to_process.append(
+                    {
+                        "index": i,
+                        "page_num": page_num,
+                        "chunk_id": chunk_id,
+                        "raw_image": img,
+                        "text": text,
+                    }
+                )
             if not pages_to_process:
                 logger.info("   All pages in batch exist, skipping...")
                 continue
             # Generate embeddings with token info
             logger.info(f"🤖 Generating embeddings for {len(pages_to_process)} pages...")
             from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty
                         config=CropEmptyConfig(
                             percentage_to_remove=float(self.crop_empty_percentage_to_remove),
                             remove_page_number=bool(self.crop_empty_remove_page_number),
+                            preserve_border_px=int(self.crop_empty_preserve_border_px),
+                            uniform_rowcol_std_threshold=float(
+                                self.crop_empty_uniform_rowcol_std_threshold
+                            ),
                         ),
                     )
                     p["embed_image"] = cropped_img
                     p["embed_image"] = raw_img
                     p["crop_meta"] = None
                     images_to_embed.append(raw_img)
             embeddings, token_infos = self.embedder.embed_images(
                 images_to_embed,
                 batch_size=self.embedding_batch_size,
                 return_token_info=True,
+                show_progress=False,
             )
             for idx, page_info in enumerate(pages_to_process):
                 raw_img = page_info["raw_image"]
                 embed_img = page_info["embed_image"]
                 text = page_info["text"]
                 embedding = embeddings[idx]
                 token_info = token_infos[idx]
+                if progress_callback:
+                    progress_callback(
+                        "process",
+                        page_num,
+                        total_pages,
+                        f"Processing page {page_num}/{total_pages}",
+                    )
                 try:
                     page_data = self._process_single_page(
+                        filename=filename,
+                        pdf_stem=pdf_path.stem,
                         page_num=page_num,
                         chunk_id=chunk_id,
                         total_pages=total_pages,
                         upload_to_cloudinary=upload_to_cloudinary,
                         crop_meta=crop_meta,
                     )
                     all_pages.append(page_data)
                     if upload_to_qdrant and self.indexer:
                         upload_queue.append(page_data)
                         # Upload in batches
                         if len(upload_queue) >= self.upload_batch_size:
                             count = self._upload_batch(upload_queue)
                             uploaded += count
                             upload_queue = []
                 except Exception as e:
                     logger.error(f"   ❌ Failed page {page_num}: {e}")
                     failed += 1
             # Memory cleanup
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
         # Upload remaining pages
         if upload_queue and upload_to_qdrant and self.indexer:
             count = self._upload_batch(upload_queue)
             uploaded += count
+        logger.info(
+            f"✅ Completed {filename}: {uploaded} uploaded, {skipped} skipped, {failed} failed"
+        )
         return {
+            "filename": filename,
             "total_pages": total_pages,
             "uploaded": uploaded,
             "skipped": skipped,
             "failed": failed,
             "pages": all_pages,
         }
     def _process_single_page(
         self,
+        filename: str,
+        pdf_stem: str,
         page_num: int,
         chunk_id: str,
         total_pages: int,
     ) -> Dict[str, Any]:
         """Process a single page with full metadata for saliency."""
         from visual_rag.embedding.pooling import global_mean_pooling
         # Resize image for ColPali
         resized_img, tile_rows, tile_cols = self.pdf_processor.resize_for_colpali(embed_img)
         # Use processor's tile info if available (more accurate)
         proc_n_rows = token_info.get("n_rows")
         proc_n_cols = token_info.get("n_cols")
         if proc_n_rows and proc_n_cols:
             tile_rows = proc_n_rows
             tile_cols = proc_n_cols
         # Convert embedding to numpy
         if isinstance(embedding, torch.Tensor):
             if embedding.dtype == torch.bfloat16:
         else:
             full_embedding = np.array(embedding)
         full_embedding = full_embedding.astype(np.float32)
         # Token info for metadata
         visual_indices = token_info["visual_token_indices"]
         num_visual_tokens = token_info["num_visual_tokens"]
         # =========================================================================
         # STRATEGY: "pooling" (NOVEL) vs "standard" (BASELINE) vs "all" (BOTH)
         # =========================================================================
         # Always compute visual-only embedding (needed for pooling and saliency)
         visual_embedding = full_embedding[visual_indices]
+        tile_pooled = self.embedder.mean_pool_visual_embedding(
+            visual_embedding, token_info, target_vectors=32
+        )
         experimental_pooled = self.embedder.experimental_pool_visual_embedding(
             visual_embedding, token_info, target_vectors=32, mean_pool=tile_pooled
         )
         global_pooled = global_mean_pooling(full_embedding)
+        global_pooling = (
+            self.embedder.global_pool_from_mean_pool(tile_pooled)
+            if tile_pooled.size
+            else global_pooled
+        )
         num_tiles = int(tile_pooled.shape[0])
         patches_per_tile = int(visual_embedding.shape[0] // max(num_tiles, 1)) if num_tiles else 0
         else:
             tile_rows = token_info.get("n_rows") or None
             tile_cols = token_info.get("n_cols") or None
         if self.embedding_strategy == "pooling":
             # NOVEL APPROACH: Visual tokens only + tile-level pooling
             embedding_for_initial = visual_embedding
             embedding_for_pooling = tile_pooled
+            global_pooling = (
+                self.embedder.global_pool_from_mean_pool(tile_pooled)
+                if tile_pooled.size
+                else global_pooled
+            )
         elif self.embedding_strategy == "standard":
             # BASELINE: All tokens + global mean
             embedding_for_initial = full_embedding
             embedding_for_pooling = global_pooled.reshape(1, -1)
             global_pooling = global_pooled
         else:  # "all" - Push BOTH representations (efficient for comparison)
             # Embed once, store multiple vector representations
             # This allows comparing both strategies without re-embedding
             embedding_for_initial = visual_embedding  # Use visual for search
+            embedding_for_pooling = tile_pooled  # Use tile-level for fast prefetch
+            global_pooling = (
+                self.embedder.global_pool_from_mean_pool(tile_pooled)
+                if tile_pooled.size
+                else global_pooled
+            )
             # ALSO store standard representations as additional vectors
             # These will be added to metadata for optional use
             pass  # Extra vectors handled in return dict below
         # Upload to Cloudinary
         original_url = None
         cropped_url = None
         resized_url = None
         if upload_to_cloudinary and self.cloudinary_uploader:
+            base_filename = f"{pdf_stem}_page_{page_num}"
             if self.crop_empty:
+                original_url, cropped_url, resized_url = (
+                    self.cloudinary_uploader.upload_original_cropped_and_resized(
+                        raw_img, embed_img, resized_img, base_filename
+                    )
                 )
             else:
                 original_url, resized_url = self.cloudinary_uploader.upload_original_and_resized(
                     raw_img, resized_img, base_filename
                 )
         # Sanitize text
         safe_text = self._sanitize_text(text[:10000]) if text else ""
         metadata = {
+            "filename": filename,
             "page_number": page_num,
             "total_pages": total_pages,
             "has_text": bool(text and text.strip()),
             "text": safe_text,
             # Image URLs
             "page": resized_url or "",  # For display
             "original_url": original_url or "",
             "cropped_url": cropped_url or "",
             "resized_url": resized_url or "",
             # Dimensions (needed for saliency overlay)
             "original_width": raw_img.width,
             "original_height": raw_img.height,
             "cropped_height": int(embed_img.height) if self.crop_empty else int(raw_img.height),
             "resized_width": resized_img.width,
             "resized_height": resized_img.height,
             # Tile structure (needed for saliency)
             "num_tiles": num_tiles,
             "tile_rows": tile_rows,
             "tile_cols": tile_cols,
             "patches_per_tile": patches_per_tile,
             # Token info (needed for saliency)
             "num_visual_tokens": num_visual_tokens,
             "visual_token_indices": visual_indices,
             "total_tokens": len(full_embedding),  # Total tokens in raw embedding
             # Strategy used (important for paper comparison)
             "embedding_strategy": self.embedding_strategy,
             "model_name": getattr(self.embedder, "model_name", None),
             "crop_empty_enabled": bool(self.crop_empty),
             "crop_empty_crop_box": (crop_meta or {}).get("crop_box"),
             "crop_empty_remove_page_number": bool(self.crop_empty_remove_page_number),
             "crop_empty_percentage_to_remove": float(self.crop_empty_percentage_to_remove),
+            "crop_empty_preserve_border_px": int(self.crop_empty_preserve_border_px),
+            "crop_empty_uniform_rowcol_std_threshold": float(
+                self.crop_empty_uniform_rowcol_std_threshold
+            ),
             # Extra metadata (year, district, etc.)
             **extra_metadata,
         }
         result = {
             "id": chunk_id,
+            "visual_embedding": embedding_for_initial,  # "initial" vector in Qdrant
             "tile_pooled_embedding": embedding_for_pooling,  # "mean_pooling" vector in Qdrant
             "experimental_pooled_embedding": experimental_pooled,  # "experimental_pooling" vector in Qdrant
             "global_pooled_embedding": global_pooling,  # "global_pooling" vector in Qdrant
             "image": raw_img,
             "resized_image": resized_img,
         }
         # For "all" strategy, include BOTH representations for comparison
         if self.embedding_strategy == "all":
             result["extra_vectors"] = {
                 # Standard baseline vectors (for comparison)
+                "full_embedding": full_embedding,  # All tokens [total, 128]
+                "global_pooled": global_pooled,  # Global mean [128]
                 # Pooling vectors (already in main result)
+                "visual_embedding": visual_embedding,  # Visual only [visual, 128]
+                "tile_pooled": tile_pooled,  # Tile-level [tiles, 128]
             }
         return result
     def _upload_batch(self, upload_queue: List[Dict[str, Any]]) -> int:
         """Upload batch to Qdrant."""
         if not upload_queue or not self.indexer:
             return 0
         logger.info(f"📤 Uploading batch of {len(upload_queue)} pages...")
         count = self.indexer.upload_batch(
             upload_queue,
             delay_between_batches=self.delay_between_uploads,
         )
         return count
     def _get_extra_metadata(self, filename: str) -> Dict[str, Any]:
         """Get extra metadata for a filename."""
         if not self.metadata_mapping:
             return {}
         # Normalize filename
         filename_clean = filename.replace(".pdf", "").replace(".PDF", "").strip().lower()
         # Try exact match
         if filename_clean in self.metadata_mapping:
             return self.metadata_mapping[filename_clean].copy()
         # Try fuzzy match
         from difflib import SequenceMatcher
         best_match = None
         best_score = 0.0
         for known_filename, metadata in self.metadata_mapping.items():
             score = SequenceMatcher(None, filename_clean, known_filename.lower()).ratio()
             if score > best_score and score > 0.75:
                 best_score = score
                 best_match = metadata
         if best_match:
             logger.debug(f"Fuzzy matched '{filename}' with score {best_score:.2f}")
             return best_match.copy()
         return {}
     def _sanitize_text(self, text: str) -> str:
         """Remove invalid Unicode characters."""
         if not text:
             return ""
         return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
     @staticmethod
     def generate_chunk_id(filename: str, page_number: int) -> str:
         """Generate deterministic chunk ID."""
         hash_obj = hashlib.sha256(content.encode())
         hex_str = hash_obj.hexdigest()[:32]
         return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"
     @staticmethod
     def load_metadata_mapping(json_path: Path) -> Dict[str, Dict[str, Any]]:
         """
         Load metadata mapping from JSON file.
         Expected format:
         {
             "filenames": {
                 ...
             }
         }
         Or simple format:
         {
             "Report Name 2023": {"year": 2023, "source": "Local Government", ...},
         }
         """
         import json
         with open(json_path, "r") as f:
             data = json.load(f)
         # Check if nested under "filenames"
         if "filenames" in data and isinstance(data["filenames"], dict):
             mapping = data["filenames"]
         else:
             mapping = data
         # Normalize keys to lowercase
         normalized = {}
         for filename, metadata in mapping.items():
             key = filename.lower().strip().replace(".pdf", "")
             normalized[key] = metadata
         logger.info(f"📖 Loaded metadata for {len(normalized)} files")
         return normalized

visual_rag/indexing/qdrant_indexer.py CHANGED Viewed

@@ -11,43 +11,61 @@ Features:
 - Configurable payload indexes
 """
-import time
 import hashlib
 import logging
-from typing import List, Dict, Any, Optional, Set
 from urllib.parse import urlparse
 import numpy as np
 logger = logging.getLogger(__name__)
 class QdrantIndexer:
     """
     Upload visual embeddings to Qdrant.
     Works independently - just needs embeddings and metadata.
     Args:
         url: Qdrant server URL
         api_key: Qdrant API key
         collection_name: Name of the collection
         timeout: Request timeout in seconds
         prefer_grpc: Use gRPC protocol (faster but may have issues)
     Example:
         >>> indexer = QdrantIndexer(
         ...     url="https://your-cluster.qdrant.io:6333",
         ...     api_key="your-api-key",
         ...     collection_name="my_collection",
         ... )
-        >>>
         >>> # Create collection
         >>> indexer.create_collection()
-        >>>
         >>> # Upload points
         >>> indexer.upload_batch(points)
     """
     def __init__(
         self,
         url: str,
@@ -57,14 +75,12 @@ class QdrantIndexer:
         prefer_grpc: bool = False,
         vector_datatype: str = "float32",
     ):
-        try:
-            from qdrant_client import QdrantClient
-        except ImportError:
             raise ImportError(
                 "Qdrant client not installed. "
                 "Install with: pip install visual-rag-toolkit[qdrant]"
             )
         self.collection_name = collection_name
         self.timeout = timeout
         if vector_datatype not in ("float32", "float16"):
@@ -81,7 +97,7 @@ class QdrantIndexer:
                     grpc_port = 6334
             except Exception:
                 grpc_port = None
         def _make_client(use_grpc: bool):
             return QdrantClient(
                 url=url,
@@ -102,16 +118,16 @@ class QdrantIndexer:
                     self.client = _make_client(False)
                 else:
                     raise
         logger.info(f"🔌 Connected to Qdrant: {url}")
         logger.info(f"   Collection: {collection_name}")
         logger.info(f"   Vector datatype: {self.vector_datatype}")
     def collection_exists(self) -> bool:
         """Return True if the server lists a collection named ``self.collection_name``."""
         listed = self.client.get_collections().collections
         for entry in listed:
             if entry.name == self.collection_name:
                 return True
         return False
     def create_collection(
         self,
         embedding_dim: int = 128,
@@ -122,32 +138,22 @@ class QdrantIndexer:
     ) -> bool:
         """
         Create collection with multi-vector support.
         Creates named vectors:
         - initial: Full multi-vector embeddings (num_patches × dim)
         - mean_pooling: Tile-level pooled vectors (num_tiles × dim)
         - experimental_pooling: Experimental multi-vector pooling (varies by model)
         - global_pooling: Single vector pooled representation (dim)
         Args:
             embedding_dim: Embedding dimension (128 for ColSmol)
             force_recreate: Delete and recreate if exists
             enable_quantization: Enable int8 quantization
             indexing_threshold: Qdrant optimizer indexing threshold (set 0 to always build ANN indexes)
         Returns:
             True if created, False if already existed
         """
-        from qdrant_client.http import models
-        from qdrant_client.http.models import (
-            Distance,
-            VectorParams,
-            OptimizersConfigDiff,
-            HnswConfigDiff,
-            ScalarQuantizationConfig,
-            ScalarType,
-        )
         if self.collection_exists():
             if force_recreate:
                 logger.info(f"🗑️ Deleting existing collection: {self.collection_name}")
@@ -155,120 +161,99 @@ class QdrantIndexer:
             else:
                 logger.info(f"✅ Collection already exists: {self.collection_name}")
                 return False
         logger.info(f"📦 Creating collection: {self.collection_name}")
         # Multi-vector config for ColBERT-style MaxSim
-        multivector_config = models.MultiVectorConfig(
-            comparator=models.MultiVectorComparator.MAX_SIM
         )
-        # HNSW config for pooled vectors
-        hnsw_config = HnswConfigDiff(
-            m=32,
-            ef_construct=100,
-            full_scan_threshold=int(full_scan_threshold),
-            on_disk=True,
         )
-        # Optional quantization
-        quantization_config = None
-        if enable_quantization:
-            logger.info("   Quantization: ENABLED (int8)")
-            quantization_config = ScalarQuantizationConfig(
-                type=ScalarType.INT8,
-                quantile=0.99,
-                always_ram=True,
-            )
-        # Vector configs
-        datatype = models.Datatype.FLOAT16 if self.vector_datatype == "float16" else models.Datatype.FLOAT32
         vectors_config = {
             "initial": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
                 on_disk=True,
                 multivector_config=multivector_config,
-                hnsw_config=hnsw_config,
                 datatype=datatype,
-                quantization_config=quantization_config,
             ),
             "mean_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
-                on_disk=False,  # Keep in RAM for fast prefetch
                 multivector_config=multivector_config,
-                hnsw_config=hnsw_config,
                 datatype=datatype,
-                quantization_config=quantization_config,
             ),
             "experimental_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
-                on_disk=False,  # Keep in RAM for fast prefetch
                 multivector_config=multivector_config,
-                hnsw_config=hnsw_config,
                 datatype=datatype,
-                quantization_config=quantization_config,
             ),
             "global_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
-                on_disk=False,  # Keep in RAM for fast prefetch
-                hnsw_config=hnsw_config,
                 datatype=datatype,
-                quantization_config=quantization_config,
             ),
         }
-        # Optimizer config for low-RAM clusters
-        optimizer_config = OptimizersConfigDiff(
-            indexing_threshold=int(indexing_threshold),
-            memmap_threshold=0,  # Use mmap immediately
-            flush_interval_sec=5,  # Flush WAL frequently
-        )
         self.client.create_collection(
             collection_name=self.collection_name,
             vectors_config=vectors_config,
-            optimizers_config=optimizer_config,
-            hnsw_config=hnsw_config,
         )
         logger.info(f"✅ Collection created: {self.collection_name}")
         return True
     def create_payload_indexes(
         self,
         fields: Optional[List[Dict[str, str]]] = None,
     ):
         """
         Create payload indexes for filtering.
         Args:
             fields: List of {field, type} dicts
                    type can be: integer, keyword, bool, float, text
         """
-        from qdrant_client.http import models
         type_mapping = {
-            "integer": models.PayloadSchemaType.INTEGER,
-            "keyword": models.PayloadSchemaType.KEYWORD,
-            "bool": models.PayloadSchemaType.BOOL,
-            "float": models.PayloadSchemaType.FLOAT,
-            "text": models.PayloadSchemaType.TEXT,
         }
         if not fields:
             return
         logger.info("📇 Creating payload indexes...")
         for field_config in fields:
             field_name = field_config["field"]
             field_type_str = field_config.get("type", "keyword")
-            field_type = type_mapping.get(field_type_str, models.PayloadSchemaType.KEYWORD)
             try:
                 self.client.create_payload_index(
                     collection_name=self.collection_name,
@@ -278,7 +263,7 @@ class QdrantIndexer:
                 logger.info(f"   ✅ {field_name} ({field_type_str})")
             except Exception as e:
                 logger.debug(f"   Index {field_name} might already exist: {e}")
     def upload_batch(
         self,
         points: List[Dict[str, Any]],
@@ -289,7 +274,7 @@ class QdrantIndexer:
     ) -> int:
         """
         Upload a batch of points to Qdrant.
         Each point should have:
         - id: Unique point ID (string or UUID)
         - visual_embedding: Full embedding [num_patches, dim]
@@ -297,28 +282,28 @@ class QdrantIndexer:
         - experimental_pooled_embedding: Experimental pooled embedding [*, dim]
         - global_pooled_embedding: Pooled embedding [dim]
         - metadata: Payload dict
         Args:
             points: List of point dicts
             max_retries: Retry attempts on failure
             delay_between_batches: Delay after upload
             wait: Wait for operation to complete on Qdrant server
             stop_event: Optional threading.Event used to cancel uploads early
         Returns:
             Number of successfully uploaded points
         """
-        from qdrant_client.http import models
         if not points:
             return 0
         def _is_cancelled() -> bool:
             return stop_event is not None and getattr(stop_event, "is_set", lambda: False)()
         def _is_payload_too_large_error(e: Exception) -> bool:
             msg = str(e)
-            if ("JSON payload" in msg and "larger than allowed" in msg) or ("Payload error:" in msg and "limit:" in msg):
                 return True
             content = getattr(e, "content", None)
             if content is not None:
@@ -329,7 +314,9 @@ class QdrantIndexer:
                         text = str(content)
                 except Exception:
                     text = ""
-                if ("JSON payload" in text and "larger than allowed" in text) or ("Payload error" in text and "limit" in text):
                     return True
             resp = getattr(e, "response", None)
             if resp is not None:
@@ -337,7 +324,9 @@ class QdrantIndexer:
                     text = str(getattr(resp, "text", "") or "")
                 except Exception:
                     text = ""
-                if ("JSON payload" in text and "larger than allowed" in text) or ("Payload error" in text and "limit" in text):
                     return True
             return False
@@ -346,8 +335,8 @@ class QdrantIndexer:
                 return val.tolist()
             return val
-        def _build_qdrant_points(batch_points: List[Dict[str, Any]]) -> List[models.PointStruct]:
-            qdrant_points: List[models.PointStruct] = []
             for p in batch_points:
                 global_pooled = p.get("global_pooled_embedding")
                 if global_pooled is None:
@@ -355,15 +344,19 @@ class QdrantIndexer:
                     global_pooled = tile_pooled.mean(axis=0)
                 global_pooled = np.array(global_pooled, dtype=np.float32).reshape(-1)
-                initial = np.array(p["visual_embedding"], dtype=np.float32).astype(self._np_vector_dtype, copy=False)
-                mean_pooling = np.array(p["tile_pooled_embedding"], dtype=np.float32).astype(self._np_vector_dtype, copy=False)
-                experimental_pooling = np.array(p["experimental_pooled_embedding"], dtype=np.float32).astype(
                     self._np_vector_dtype, copy=False
                 )
                 global_pooling = global_pooled.astype(self._np_vector_dtype, copy=False)
                 qdrant_points.append(
-                    models.PointStruct(
                         id=p["id"],
                         vector={
                             "initial": _to_list(initial),
@@ -375,7 +368,7 @@ class QdrantIndexer:
                     )
                 )
             return qdrant_points
         # Upload with retry
         for attempt in range(max_retries):
             try:
@@ -421,11 +414,11 @@ class QdrantIndexer:
                 if attempt < max_retries - 1:
                     if _is_cancelled():
                         return 0
-                    time.sleep(2 ** attempt)  # Exponential backoff
         logger.error(f"❌ Upload failed after {max_retries} attempts")
         return 0
     def check_exists(self, chunk_id: str) -> bool:
         """Check if a point already exists."""
         try:
@@ -438,50 +431,78 @@ class QdrantIndexer:
             return len(result) > 0
         except Exception:
             return False
     def get_existing_ids(self, filename: str) -> Set[str]:
-        """Get all point IDs for a specific file."""
-        from qdrant_client.models import Filter, FieldCondition, MatchValue
         existing_ids = set()
         offset = None
-        while True:
-            results = self.client.scroll(
-                collection_name=self.collection_name,
-                scroll_filter=Filter(
-                    must=[FieldCondition(key="filename", match=MatchValue(value=filename))]
-                ),
-                limit=100,
-                offset=offset,
-                with_payload=["page_number"],
-                with_vectors=False,
-            )
-            points, next_offset = results
-            for point in points:
-                existing_ids.add(str(point.id))
-            if next_offset is None or len(points) == 0:
-                break
-            offset = next_offset
         return existing_ids
     def get_collection_info(self) -> Optional[Dict[str, Any]]:
         """Get collection statistics."""
         try:
             info = self.client.get_collection(self.collection_name)
             status = info.status
             if hasattr(status, "value"):
                 status = status.value
             indexed_count = getattr(info, "indexed_vectors_count", 0) or 0
             if isinstance(indexed_count, dict):
                 indexed_count = sum(indexed_count.values())
             return {
                 "status": str(status),
                 "points_count": getattr(info, "points_count", 0),
@@ -490,12 +511,12 @@ class QdrantIndexer:
         except Exception as e:
             logger.warning(f"Could not get collection info: {e}")
             return None
     @staticmethod
     def generate_point_id(filename: str, page_number: int) -> str:
         """
         Generate deterministic point ID from filename and page.
         Returns a valid UUID string.
         """
         content = f"{filename}:page:{page_number}"
@@ -503,5 +524,3 @@ class QdrantIndexer:
         hex_str = hash_obj.hexdigest()[:32]
         # Format as UUID
         return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"

 - Configurable payload indexes
 """
 import hashlib
 import logging
+import time
+from typing import Any, Dict, List, Optional, Set
 from urllib.parse import urlparse
 import numpy as np
+try:
+    from qdrant_client import QdrantClient
+    from qdrant_client.http import models as qdrant_models
+    from qdrant_client.http.models import Distance, VectorParams
+    from qdrant_client.models import FieldCondition, Filter, MatchValue
+    QDRANT_AVAILABLE = True
+except ImportError:
+    QDRANT_AVAILABLE = False
+    QdrantClient = None
+    qdrant_models = None
+    Distance = None
+    VectorParams = None
+    FieldCondition = None
+    Filter = None
+    MatchValue = None
 logger = logging.getLogger(__name__)
 class QdrantIndexer:
     """
     Upload visual embeddings to Qdrant.
     Works independently - just needs embeddings and metadata.
     Args:
         url: Qdrant server URL
         api_key: Qdrant API key
         collection_name: Name of the collection
         timeout: Request timeout in seconds
         prefer_grpc: Use gRPC protocol (faster but may have issues)
     Example:
         >>> indexer = QdrantIndexer(
         ...     url="https://your-cluster.qdrant.io:6333",
         ...     api_key="your-api-key",
         ...     collection_name="my_collection",
         ... )
+        >>>
         >>> # Create collection
         >>> indexer.create_collection()
+        >>>
         >>> # Upload points
         >>> indexer.upload_batch(points)
     """
     def __init__(
         self,
         url: str,
         prefer_grpc: bool = False,
         vector_datatype: str = "float32",
     ):
+        if not QDRANT_AVAILABLE:
             raise ImportError(
                 "Qdrant client not installed. "
                 "Install with: pip install visual-rag-toolkit[qdrant]"
             )
         self.collection_name = collection_name
         self.timeout = timeout
         if vector_datatype not in ("float32", "float16"):
                     grpc_port = 6334
             except Exception:
                 grpc_port = None
         def _make_client(use_grpc: bool):
             return QdrantClient(
                 url=url,
                     self.client = _make_client(False)
                 else:
                     raise
         logger.info(f"🔌 Connected to Qdrant: {url}")
         logger.info(f"   Collection: {collection_name}")
         logger.info(f"   Vector datatype: {self.vector_datatype}")
     def collection_exists(self) -> bool:
         """Report whether ``self.collection_name`` is among the collections the client returns."""
         response = self.client.get_collections()
         matches = (item.name == self.collection_name for item in response.collections)
         return True in matches
     def create_collection(
         self,
         embedding_dim: int = 128,
     ) -> bool:
         """
         Create collection with multi-vector support.
         Creates named vectors:
         - initial: Full multi-vector embeddings (num_patches × dim)
         - mean_pooling: Tile-level pooled vectors (num_tiles × dim)
         - experimental_pooling: Experimental multi-vector pooling (varies by model)
         - global_pooling: Single vector pooled representation (dim)
         Args:
             embedding_dim: Embedding dimension (128 for ColSmol)
             force_recreate: Delete and recreate if exists
             enable_quantization: Enable int8 quantization
             indexing_threshold: Qdrant optimizer indexing threshold (set 0 to always build ANN indexes)
         Returns:
             True if created, False if already existed
         """
         if self.collection_exists():
             if force_recreate:
                 logger.info(f"🗑️ Deleting existing collection: {self.collection_name}")
             else:
                 logger.info(f"✅ Collection already exists: {self.collection_name}")
                 return False
         logger.info(f"📦 Creating collection: {self.collection_name}")
         # Multi-vector config for ColBERT-style MaxSim
+        multivector_config = qdrant_models.MultiVectorConfig(
+            comparator=qdrant_models.MultiVectorComparator.MAX_SIM
         )
+        # Vector configs - simplified for compatibility
+        datatype = (
+            qdrant_models.Datatype.FLOAT16
+            if self.vector_datatype == "float16"
+            else qdrant_models.Datatype.FLOAT32
         )
         vectors_config = {
             "initial": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
                 on_disk=True,
                 multivector_config=multivector_config,
                 datatype=datatype,
             ),
             "mean_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
+                on_disk=False,
                 multivector_config=multivector_config,
                 datatype=datatype,
             ),
             "experimental_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
+                on_disk=False,
                 multivector_config=multivector_config,
                 datatype=datatype,
             ),
             "global_pooling": VectorParams(
                 size=embedding_dim,
                 distance=Distance.COSINE,
+                on_disk=False,
                 datatype=datatype,
             ),
         }
         self.client.create_collection(
             collection_name=self.collection_name,
             vectors_config=vectors_config,
         )
+        # Create required payload index for skip_existing functionality
+        # This index is needed for filtering by filename when checking existing docs
+        try:
+            self.client.create_payload_index(
+                collection_name=self.collection_name,
+                field_name="filename",
+                field_schema=qdrant_models.PayloadSchemaType.KEYWORD,
+            )
+            logger.info("   📇 Created payload index: filename")
+        except Exception as e:
+            logger.warning(f"   ⚠️ Could not create filename index: {e}")
         logger.info(f"✅ Collection created: {self.collection_name}")
         return True
     def create_payload_indexes(
         self,
         fields: Optional[List[Dict[str, str]]] = None,
     ):
         """
         Create payload indexes for filtering.
         Args:
             fields: List of {field, type} dicts
                    type can be: integer, keyword, bool, float, text
         """
         type_mapping = {
+            "integer": qdrant_models.PayloadSchemaType.INTEGER,
+            "keyword": qdrant_models.PayloadSchemaType.KEYWORD,
+            "bool": qdrant_models.PayloadSchemaType.BOOL,
+            "float": qdrant_models.PayloadSchemaType.FLOAT,
+            "text": qdrant_models.PayloadSchemaType.TEXT,
         }
         if not fields:
             return
         logger.info("📇 Creating payload indexes...")
         for field_config in fields:
             field_name = field_config["field"]
             field_type_str = field_config.get("type", "keyword")
+            field_type = type_mapping.get(field_type_str, qdrant_models.PayloadSchemaType.KEYWORD)
             try:
                 self.client.create_payload_index(
                     collection_name=self.collection_name,
                 logger.info(f"   ✅ {field_name} ({field_type_str})")
             except Exception as e:
                 logger.debug(f"   Index {field_name} might already exist: {e}")
     def upload_batch(
         self,
         points: List[Dict[str, Any]],
     ) -> int:
         """
         Upload a batch of points to Qdrant.
         Each point should have:
         - id: Unique point ID (string or UUID)
         - visual_embedding: Full embedding [num_patches, dim]
         - experimental_pooled_embedding: Experimental pooled embedding [*, dim]
         - global_pooled_embedding: Pooled embedding [dim]
         - metadata: Payload dict
         Args:
             points: List of point dicts
             max_retries: Retry attempts on failure
             delay_between_batches: Delay after upload
             wait: Wait for operation to complete on Qdrant server
             stop_event: Optional threading.Event used to cancel uploads early
         Returns:
             Number of successfully uploaded points
         """
         if not points:
             return 0
         def _is_cancelled() -> bool:
             return stop_event is not None and getattr(stop_event, "is_set", lambda: False)()
         def _is_payload_too_large_error(e: Exception) -> bool:
             msg = str(e)
+            if ("JSON payload" in msg and "larger than allowed" in msg) or (
+                "Payload error:" in msg and "limit:" in msg
+            ):
                 return True
             content = getattr(e, "content", None)
             if content is not None:
                         text = str(content)
                 except Exception:
                     text = ""
+                if ("JSON payload" in text and "larger than allowed" in text) or (
+                    "Payload error" in text and "limit" in text
+                ):
                     return True
             resp = getattr(e, "response", None)
             if resp is not None:
                     text = str(getattr(resp, "text", "") or "")
                 except Exception:
                     text = ""
+                if ("JSON payload" in text and "larger than allowed" in text) or (
+                    "Payload error" in text and "limit" in text
+                ):
                     return True
             return False
                 return val.tolist()
             return val
+        def _build_qdrant_points(batch_points: List[Dict[str, Any]]) -> List[qdrant_models.PointStruct]:
+            qdrant_points: List[qdrant_models.PointStruct] = []
             for p in batch_points:
                 global_pooled = p.get("global_pooled_embedding")
                 if global_pooled is None:
                     global_pooled = tile_pooled.mean(axis=0)
                 global_pooled = np.array(global_pooled, dtype=np.float32).reshape(-1)
+                initial = np.array(p["visual_embedding"], dtype=np.float32).astype(
+                    self._np_vector_dtype, copy=False
+                )
+                mean_pooling = np.array(p["tile_pooled_embedding"], dtype=np.float32).astype(
                     self._np_vector_dtype, copy=False
                 )
+                experimental_pooling = np.array(
+                    p["experimental_pooled_embedding"], dtype=np.float32
+                ).astype(self._np_vector_dtype, copy=False)
                 global_pooling = global_pooled.astype(self._np_vector_dtype, copy=False)
                 qdrant_points.append(
+                    qdrant_models.PointStruct(
                         id=p["id"],
                         vector={
                             "initial": _to_list(initial),
                     )
                 )
             return qdrant_points
         # Upload with retry
         for attempt in range(max_retries):
             try:
                 if attempt < max_retries - 1:
                     if _is_cancelled():
                         return 0
+                    time.sleep(2**attempt)  # Exponential backoff
         logger.error(f"❌ Upload failed after {max_retries} attempts")
         return 0
     def check_exists(self, chunk_id: str) -> bool:
         """Check if a point already exists."""
         try:
             return len(result) > 0
         except Exception:
             return False
     def get_existing_ids(self, filename: str) -> Set[str]:
         """Get all point IDs for a specific file.

         Scrolls the collection in pages of 100 points, filtering on the
         ``filename`` payload field, and collects every point ID as a string.

         Requires a payload index on 'filename' field. If the query fails with
         an error that looks like a missing index, this method attempts to
         create a KEYWORD index on ``filename`` and retries the query once.
         The retry is bounded: a second failure is reported and an empty set
         is returned instead of looping forever.

         Args:
             filename: Value of the ``filename`` payload field to match.

         Returns:
             Set of point IDs (as strings) stored for ``filename``. An empty
             set is returned when nothing matches or when the lookup fails,
             in which case callers will treat every page as not yet indexed.
         """
         index_creation_attempted = False
         while True:
             # Start from scratch on every attempt so a failed partial scroll
             # never leaks into the final result.
             existing_ids: Set[str] = set()
             offset = None
             try:
                 scroll_filter = Filter(
                     must=[FieldCondition(key="filename", match=MatchValue(value=filename))]
                 )
                 while True:
                     points, next_offset = self.client.scroll(
                         collection_name=self.collection_name,
                         scroll_filter=scroll_filter,
                         limit=100,
                         offset=offset,
                         with_payload=["page_number"],
                         with_vectors=False,
                     )
                     existing_ids.update(str(point.id) for point in points)
                     if next_offset is None or len(points) == 0:
                         return existing_ids
                     offset = next_offset
             except Exception as e:
                 error_msg = str(e).lower()
                 missing_index = "index required" in error_msg or (
                     "index" in error_msg and "filename" in error_msg
                 )
                 if not missing_index or index_creation_attempted:
                     # Either an unrelated failure, or the index was already
                     # created once and the query still fails: give up.
                     logger.warning(f"⚠️ Error checking existing IDs: {e}")
                     return set()
                 index_creation_attempted = True
                 # Missing payload index - try to create it
                 logger.warning(
                     "⚠️ Missing 'filename' payload index. Creating it now... "
                     "(skip_existing requires this index for filtering)"
                 )
                 try:
                     self.client.create_payload_index(
                         collection_name=self.collection_name,
                         field_name="filename",
                         field_schema=qdrant_models.PayloadSchemaType.KEYWORD,
                     )
                 except Exception as idx_err:
                     logger.warning(f"   ❌ Could not create index: {idx_err}")
                     logger.warning("   Returning empty set - all pages will be processed")
                     return set()
                 logger.info("   ✅ Created 'filename' index. Retrying query...")
                 # Loop around to retry the query exactly once.
     def get_collection_info(self) -> Optional[Dict[str, Any]]:
         """Get collection statistics."""
         try:
             info = self.client.get_collection(self.collection_name)
             status = info.status
             if hasattr(status, "value"):
                 status = status.value
             indexed_count = getattr(info, "indexed_vectors_count", 0) or 0
             if isinstance(indexed_count, dict):
                 indexed_count = sum(indexed_count.values())
             return {
                 "status": str(status),
                 "points_count": getattr(info, "points_count", 0),
         except Exception as e:
             logger.warning(f"Could not get collection info: {e}")
             return None
     @staticmethod
     def generate_point_id(filename: str, page_number: int) -> str:
         """
         Generate deterministic point ID from filename and page.
         Returns a valid UUID string.
         """
         content = f"{filename}:page:{page_number}"
         hex_str = hash_obj.hexdigest()[:32]
         # Format as UUID
         return f"{hex_str[:8]}-{hex_str[8:12]}-{hex_str[12:16]}-{hex_str[16:20]}-{hex_str[20:32]}"

visual_rag/preprocessing/__init__.py CHANGED Viewed

@@ -1,5 +1,3 @@
 from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty
 __all__ = ["CropEmptyConfig", "crop_empty"]


1	from visual_rag.preprocessing.crop_empty import CropEmptyConfig, crop_empty
2
3	__all__ = ["CropEmptyConfig", "crop_empty"]

visual_rag/preprocessing/crop_empty.py CHANGED Viewed

@@ -20,7 +20,9 @@ class CropEmptyConfig:
     uniform_rowcol_std_threshold: float = 0.0
-def crop_empty(image: Image.Image, *, config: CropEmptyConfig) -> Tuple[Image.Image, Dict[str, Any]]:
     img = image.convert("RGB")
     arr = np.array(img)
     intensity = arr.mean(axis=2)
@@ -31,7 +33,9 @@ def crop_empty(image: Image.Image, *, config: CropEmptyConfig) -> Tuple[Image.Im
             pixels = intensity[i, :] if axis == 0 else intensity[:, i]
             white = float(np.mean(pixels > config.color_threshold))
             non_white = 1.0 - white
-            if float(config.uniform_rowcol_std_threshold) > 0.0 and float(np.std(pixels)) <= float(config.uniform_rowcol_std_threshold):
                 continue
             if (white < config.min_white_fraction) and (non_white > min_content_density_threshold):
                 return int(i)
@@ -43,7 +47,9 @@ def crop_empty(image: Image.Image, *, config: CropEmptyConfig) -> Tuple[Image.Im
             pixels = intensity[i, :] if axis == 0 else intensity[:, i]
             white = float(np.mean(pixels > config.color_threshold))
             non_white = 1.0 - white
-            if float(config.uniform_rowcol_std_threshold) > 0.0 and float(np.std(pixels)) <= float(config.uniform_rowcol_std_threshold):
                 continue
             if (white < config.min_white_fraction) and (non_white > min_content_density_threshold):
                 return int(i + 1)
@@ -53,8 +59,12 @@ def crop_empty(image: Image.Image, *, config: CropEmptyConfig) -> Tuple[Image.Im
     left = _find_border_start(1, min_content_density_threshold=float(config.content_density_sides))
     right = _find_border_end(1, min_content_density_threshold=float(config.content_density_sides))
-    main_text_end = _find_border_end(0, min_content_density_threshold=float(config.content_density_main_text))
-    last_content_end = _find_border_end(0, min_content_density_threshold=float(config.content_density_any))
     bottom = main_text_end if config.remove_page_number else last_content_end
     width, height = img.size
@@ -108,5 +118,3 @@ def crop_empty(image: Image.Image, *, config: CropEmptyConfig) -> Tuple[Image.Im
             "uniform_rowcol_std_threshold": float(config.uniform_rowcol_std_threshold),
         },
     }

     uniform_rowcol_std_threshold: float = 0.0
+def crop_empty(
+    image: Image.Image, *, config: CropEmptyConfig
+) -> Tuple[Image.Image, Dict[str, Any]]:
     img = image.convert("RGB")
     arr = np.array(img)
     intensity = arr.mean(axis=2)
             pixels = intensity[i, :] if axis == 0 else intensity[:, i]
             white = float(np.mean(pixels > config.color_threshold))
             non_white = 1.0 - white
+            if float(config.uniform_rowcol_std_threshold) > 0.0 and float(np.std(pixels)) <= float(
+                config.uniform_rowcol_std_threshold
+            ):
                 continue
             if (white < config.min_white_fraction) and (non_white > min_content_density_threshold):
                 return int(i)
             pixels = intensity[i, :] if axis == 0 else intensity[:, i]
             white = float(np.mean(pixels > config.color_threshold))
             non_white = 1.0 - white
+            if float(config.uniform_rowcol_std_threshold) > 0.0 and float(np.std(pixels)) <= float(
+                config.uniform_rowcol_std_threshold
+            ):
                 continue
             if (white < config.min_white_fraction) and (non_white > min_content_density_threshold):
                 return int(i + 1)
     left = _find_border_start(1, min_content_density_threshold=float(config.content_density_sides))
     right = _find_border_end(1, min_content_density_threshold=float(config.content_density_sides))
+    main_text_end = _find_border_end(
+        0, min_content_density_threshold=float(config.content_density_main_text)
+    )
+    last_content_end = _find_border_end(
+        0, min_content_density_threshold=float(config.content_density_any)
+    )
     bottom = main_text_end if config.remove_page_number else last_content_end
     width, height = img.size
             "uniform_rowcol_std_threshold": float(config.uniform_rowcol_std_threshold),
         },
     }

visual_rag/qdrant_admin.py CHANGED Viewed

@@ -33,9 +33,16 @@ def _resolve_qdrant_connection(
     import os
     _maybe_load_dotenv()
-    resolved_url = url or os.getenv("SIGIR_QDRANT_URL") or os.getenv("DEST_QDRANT_URL") or os.getenv("QDRANT_URL")
     if not resolved_url:
-        raise ValueError("Qdrant URL not set (pass url= or set SIGIR_QDRANT_URL/DEST_QDRANT_URL/QDRANT_URL).")
     resolved_key = (
         api_key
         or os.getenv("SIGIR_QDRANT_KEY")
@@ -105,7 +112,11 @@ class QdrantAdmin:
         from qdrant_client.http import models as m
         hnsw_diff = m.HnswConfigDiff(**hnsw_config) if isinstance(hnsw_config, dict) else None
-        params_diff = m.CollectionParamsDiff(**collection_params) if isinstance(collection_params, dict) else None
         if hnsw_diff is None and params_diff is None:
             raise ValueError("No changes provided (pass hnsw_config and/or collection_params).")
         return bool(
@@ -143,7 +154,9 @@ class QdrantAdmin:
         missing = [str(k) for k in (vectors or {}).keys() if existing and str(k) not in existing]
         if missing:
-            raise ValueError(f"Vectors do not exist in collection '{collection_name}': {missing}. Existing: {sorted(existing)}")
         ok = True
         for name, cfg in (vectors or {}).items():
@@ -158,13 +171,16 @@ class QdrantAdmin:
                 )
             }
-            ok = bool(
-                self.client.update_collection(
-                    collection_name=collection_name,
-                    vectors_config=vectors_diff,
-                    timeout=int(timeout) if timeout is not None else None,
                 )
-            ) and ok
         return ok
@@ -192,7 +208,9 @@ class QdrantAdmin:
             vectors[str(vname)] = {"on_disk": True, "hnsw_config": {"on_disk": True}}
         if vectors:
-            self.modify_collection_vector_config(collection_name=collection_name, vectors=vectors, timeout=timeout)
         self.modify_collection_config(
             collection_name=collection_name,
@@ -202,4 +220,3 @@ class QdrantAdmin:
         )
         return self.get_collection_info(collection_name=collection_name)

     import os
     _maybe_load_dotenv()
+    resolved_url = (
+        url
+        or os.getenv("SIGIR_QDRANT_URL")
+        or os.getenv("DEST_QDRANT_URL")
+        or os.getenv("QDRANT_URL")
+    )
     if not resolved_url:
+        raise ValueError(
+            "Qdrant URL not set (pass url= or set SIGIR_QDRANT_URL/DEST_QDRANT_URL/QDRANT_URL)."
+        )
     resolved_key = (
         api_key
         or os.getenv("SIGIR_QDRANT_KEY")
         from qdrant_client.http import models as m
         hnsw_diff = m.HnswConfigDiff(**hnsw_config) if isinstance(hnsw_config, dict) else None
+        params_diff = (
+            m.CollectionParamsDiff(**collection_params)
+            if isinstance(collection_params, dict)
+            else None
+        )
         if hnsw_diff is None and params_diff is None:
             raise ValueError("No changes provided (pass hnsw_config and/or collection_params).")
         return bool(
         missing = [str(k) for k in (vectors or {}).keys() if existing and str(k) not in existing]
         if missing:
+            raise ValueError(
+                f"Vectors do not exist in collection '{collection_name}': {missing}. Existing: {sorted(existing)}"
+            )
         ok = True
         for name, cfg in (vectors or {}).items():
                 )
             }
+            ok = (
+                bool(
+                    self.client.update_collection(
+                        collection_name=collection_name,
+                        vectors_config=vectors_diff,
+                        timeout=int(timeout) if timeout is not None else None,
+                    )
                 )
+                and ok
+            )
         return ok
             vectors[str(vname)] = {"on_disk": True, "hnsw_config": {"on_disk": True}}
         if vectors:
+            self.modify_collection_vector_config(
+                collection_name=collection_name, vectors=vectors, timeout=timeout
+            )
         self.modify_collection_config(
             collection_name=collection_name,
         )
         return self.get_collection_info(collection_name=collection_name)

visual_rag/retrieval/__init__.py CHANGED Viewed

@@ -6,10 +6,10 @@ Components:
 - SingleStageRetriever: Direct multi-vector or pooled search
 """
-from visual_rag.retrieval.two_stage import TwoStageRetriever
-from visual_rag.retrieval.single_stage import SingleStageRetriever
 from visual_rag.retrieval.multi_vector import MultiVectorRetriever
 from visual_rag.retrieval.three_stage import ThreeStageRetriever
 __all__ = [
     "TwoStageRetriever",

 - SingleStageRetriever: Direct multi-vector or pooled search
 """
 from visual_rag.retrieval.multi_vector import MultiVectorRetriever
+from visual_rag.retrieval.single_stage import SingleStageRetriever
 from visual_rag.retrieval.three_stage import ThreeStageRetriever
+from visual_rag.retrieval.two_stage import TwoStageRetriever
 __all__ = [
     "TwoStageRetriever",

visual_rag/retrieval/multi_vector.py CHANGED Viewed

@@ -2,18 +2,35 @@ import os
 from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse
 from visual_rag.embedding.visual_embedder import VisualEmbedder
 from visual_rag.retrieval.single_stage import SingleStageRetriever
-from visual_rag.retrieval.two_stage import TwoStageRetriever
 from visual_rag.retrieval.three_stage import ThreeStageRetriever
 class MultiVectorRetriever:
     @staticmethod
     def _maybe_load_dotenv() -> None:
-        try:
-            from dotenv import load_dotenv
-        except ImportError:
             return
         if os.path.exists(".env"):
             load_dotenv(".env")
@@ -33,83 +50,84 @@ class MultiVectorRetriever:
     ):
         if qdrant_client is None:
             self._maybe_load_dotenv()
-            try:
-                from qdrant_client import QdrantClient
-            except ImportError as e:
                 raise ImportError(
                     "Qdrant client not installed. Install with: pip install visual-rag-toolkit[qdrant]"
-                ) from e
             qdrant_url = (
                 qdrant_url
-                or os.getenv("SIGIR_QDRANT_URL")
-                or os.getenv("DEST_QDRANT_URL")
                 or os.getenv("QDRANT_URL")
             )
             if not qdrant_url:
                 raise ValueError(
-                    "QDRANT_URL is required (pass qdrant_url or set env var). "
-                    "You can also set DEST_QDRANT_URL to override."
                 )
             qdrant_api_key = (
                 qdrant_api_key
-                or os.getenv("SIGIR_QDRANT_KEY")
-                or os.getenv("SIGIR_QDRANT_API_KEY")
-                or os.getenv("DEST_QDRANT_API_KEY")
                 or os.getenv("QDRANT_API_KEY")
             )
             grpc_port = None
             if prefer_grpc:
                 try:
-                    if urlparse(qdrant_url).port == 6333:
                         grpc_port = 6334
                 except Exception:
-                    grpc_port = None
             def _make_client(use_grpc: bool):
                 return QdrantClient(
                     url=qdrant_url,
                     api_key=qdrant_api_key,
                     prefer_grpc=bool(use_grpc),
                     grpc_port=grpc_port,
-                    timeout=int(request_timeout),
                     check_compatibility=False,
                 )
-            qdrant_client = _make_client(prefer_grpc)
             if prefer_grpc:
                 try:
-                    _ = qdrant_client.get_collections()
                 except Exception as e:
                     msg = str(e)
                     if "StatusCode.PERMISSION_DENIED" in msg or "http2 header with status: 403" in msg:
-                        qdrant_client = _make_client(False)
                     else:
                         raise
         self.client = qdrant_client
         self.collection_name = collection_name
         self.embedder = embedder or VisualEmbedder(model_name=model_name)
         self._two_stage = TwoStageRetriever(
-            self.client,
-            collection_name=self.collection_name,
-            request_timeout=int(request_timeout),
-            max_retries=int(max_retries),
-            retry_sleep=float(retry_sleep),
         )
         self._three_stage = ThreeStageRetriever(
-            self.client,
-            collection_name=self.collection_name,
-            request_timeout=int(request_timeout),
-            max_retries=int(max_retries),
-            retry_sleep=float(retry_sleep),
         )
         self._single_stage = SingleStageRetriever(
-            self.client,
-            collection_name=self.collection_name,
-            request_timeout=int(request_timeout),
         )
     def build_filter(
@@ -139,14 +157,10 @@ class MultiVectorRetriever:
         return_embeddings: bool = False,
     ) -> List[Dict[str, Any]]:
         q = self.embedder.embed_query(query)
-        try:
-            import torch
-        except ImportError:
-            torch = None
-        if torch is not None and isinstance(q, torch.Tensor):
             query_embedding = q.detach().cpu().numpy()
         else:
-            query_embedding = q.numpy()
         return self.search_embedded(
             query_embedding=query_embedding,
@@ -175,27 +189,17 @@ class MultiVectorRetriever:
             return self._single_stage.search(
                 query_embedding=query_embedding,
                 top_k=top_k,
-                strategy="multi_vector",
-                filter_obj=filter_obj,
-            )
-        if mode == "single_tiles":
-            return self._single_stage.search(
-                query_embedding=query_embedding,
-                top_k=top_k,
-                strategy="tiles_maxsim",
                 filter_obj=filter_obj,
             )
-        if mode == "single_global":
             return self._single_stage.search(
                 query_embedding=query_embedding,
                 top_k=top_k,
-                strategy="pooled_global",
                 filter_obj=filter_obj,
             )
-        if mode == "two_stage":
             return self._two_stage.search_server_side(
                 query_embedding=query_embedding,
                 top_k=top_k,
@@ -203,18 +207,14 @@ class MultiVectorRetriever:
                 filter_obj=filter_obj,
                 stage1_mode=stage1_mode,
             )
-        if mode == "three_stage":
-            s1 = int(stage1_k) if stage1_k is not None else 1000
-            s2 = int(stage2_k) if stage2_k is not None else 300
             return self._three_stage.search_server_side(
                 query_embedding=query_embedding,
                 top_k=top_k,
-                stage1_k=s1,
-                stage2_k=s2,
                 filter_obj=filter_obj,
             )
-        raise ValueError(f"Unknown mode: {mode}")

 from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse
+import numpy as np
+import torch
+try:
+    from dotenv import load_dotenv
+    DOTENV_AVAILABLE = True
+except ImportError:
+    DOTENV_AVAILABLE = False
+    load_dotenv = None
+try:
+    from qdrant_client import QdrantClient
+    QDRANT_AVAILABLE = True
+except ImportError:
+    QDRANT_AVAILABLE = False
+    QdrantClient = None
 from visual_rag.embedding.visual_embedder import VisualEmbedder
 from visual_rag.retrieval.single_stage import SingleStageRetriever
 from visual_rag.retrieval.three_stage import ThreeStageRetriever
+from visual_rag.retrieval.two_stage import TwoStageRetriever
 class MultiVectorRetriever:
     @staticmethod
     def _maybe_load_dotenv() -> None:
+        if not DOTENV_AVAILABLE:
             return
         if os.path.exists(".env"):
             load_dotenv(".env")
     ):
         if qdrant_client is None:
             self._maybe_load_dotenv()
+            if not QDRANT_AVAILABLE:
                 raise ImportError(
                     "Qdrant client not installed. Install with: pip install visual-rag-toolkit[qdrant]"
+                )
             qdrant_url = (
                 qdrant_url
                 or os.getenv("QDRANT_URL")
+                or os.getenv("SIGIR_QDRANT_URL")  # legacy
             )
             if not qdrant_url:
                 raise ValueError(
+                    "QDRANT_URL is required (pass qdrant_url or set env var)."
                 )
             qdrant_api_key = (
                 qdrant_api_key
                 or os.getenv("QDRANT_API_KEY")
+                or os.getenv("SIGIR_QDRANT_KEY")  # legacy
             )
             grpc_port = None
             if prefer_grpc:
                 try:
+                    parsed = urlparse(qdrant_url)
+                    port = parsed.port
+                    if port == 6333:
                         grpc_port = 6334
                 except Exception:
+                    pass
             def _make_client(use_grpc: bool):
                 return QdrantClient(
                     url=qdrant_url,
                     api_key=qdrant_api_key,
+                    timeout=request_timeout,
                     prefer_grpc=bool(use_grpc),
                     grpc_port=grpc_port,
                     check_compatibility=False,
                 )
+            client = _make_client(prefer_grpc)
             if prefer_grpc:
                 try:
+                    _ = client.get_collections()
                 except Exception as e:
                     msg = str(e)
                     if "StatusCode.PERMISSION_DENIED" in msg or "http2 header with status: 403" in msg:
+                        client = _make_client(False)
                     else:
                         raise
+            qdrant_client = client
         self.client = qdrant_client
         self.collection_name = collection_name
         self.embedder = embedder or VisualEmbedder(model_name=model_name)
         self._two_stage = TwoStageRetriever(
+            qdrant_client=qdrant_client,
+            collection_name=collection_name,
+            request_timeout=request_timeout,
+            max_retries=max_retries,
+            retry_sleep=retry_sleep,
         )
         self._three_stage = ThreeStageRetriever(
+            qdrant_client=qdrant_client,
+            collection_name=collection_name,
+            request_timeout=request_timeout,
+            max_retries=max_retries,
+            retry_sleep=retry_sleep,
         )
         self._single_stage = SingleStageRetriever(
+            qdrant_client=qdrant_client,
+            collection_name=collection_name,
+            request_timeout=request_timeout,
+            max_retries=max_retries,
+            retry_sleep=retry_sleep,
         )
     def build_filter(
         return_embeddings: bool = False,
     ) -> List[Dict[str, Any]]:
         q = self.embedder.embed_query(query)
+        if isinstance(q, torch.Tensor):
             query_embedding = q.detach().cpu().numpy()
         else:
+            query_embedding = np.asarray(q)
         return self.search_embedded(
             query_embedding=query_embedding,
             return self._single_stage.search(
                 query_embedding=query_embedding,
                 top_k=top_k,
                 filter_obj=filter_obj,
+                using="initial",
             )
+        elif mode == "single_pooled":
             return self._single_stage.search(
                 query_embedding=query_embedding,
                 top_k=top_k,
                 filter_obj=filter_obj,
+                using="mean_pooling",
             )
+        elif mode == "two_stage":
             return self._two_stage.search_server_side(
                 query_embedding=query_embedding,
                 top_k=top_k,
                 filter_obj=filter_obj,
                 stage1_mode=stage1_mode,
             )
+        elif mode == "three_stage":
             return self._three_stage.search_server_side(
                 query_embedding=query_embedding,
                 top_k=top_k,
+                stage1_k=stage1_k,
+                stage2_k=stage2_k,
                 filter_obj=filter_obj,
+                stage1_mode=stage1_mode,
             )
+        else:
+            raise ValueError(f"Unknown mode: {mode}")

visual_rag/retrieval/single_stage.py CHANGED Viewed

@@ -9,7 +9,8 @@ Use when:
 """
 import logging
-from typing import List, Dict, Any, Optional, Union
 import numpy as np
 import torch
@@ -19,22 +20,22 @@ logger = logging.getLogger(__name__)
 class SingleStageRetriever:
     """
     Single-stage visual document retrieval using native Qdrant search.
     Supports strategies:
     - multi_vector: Native MaxSim on full embeddings (using="initial")
     - tiles_maxsim: Native MaxSim between query tokens and tile vectors (using="mean_pooling")
     - pooled_tile: Pooled query vs tile vectors (using="mean_pooling")
     - pooled_global: Pooled query vs global pooled doc vector (using="global_pooling")
     Args:
         qdrant_client: Connected Qdrant client
         collection_name: Name of the Qdrant collection
     Example:
         >>> retriever = SingleStageRetriever(client, "my_collection")
         >>> results = retriever.search(query, top_k=10)
     """
     def __init__(
         self,
         qdrant_client,
@@ -44,7 +45,7 @@ class SingleStageRetriever:
         self.client = qdrant_client
         self.collection_name = collection_name
         self.request_timeout = int(request_timeout)
     def search(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
@@ -54,47 +55,47 @@ class SingleStageRetriever:
     ) -> List[Dict[str, Any]]:
         """
         Single-stage search with configurable strategy.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Number of results
             strategy: "multi_vector", "tiles_maxsim", "pooled_tile", or "pooled_global"
             filter_obj: Qdrant filter
         Returns:
             List of results with scores and metadata
         """
         query_np = self._to_numpy(query_embedding)
         if strategy == "multi_vector":
             # Native multi-vector MaxSim
             vector_name = "initial"
             query_vector = query_np.tolist()
             logger.debug(f"🎯 Multi-vector search on '{vector_name}'")
         elif strategy == "tiles_maxsim":
             # Native multi-vector MaxSim against tile vectors
             vector_name = "mean_pooling"
             query_vector = query_np.tolist()
             logger.debug(f"🎯 Tile MaxSim search on '{vector_name}'")
         elif strategy == "pooled_tile":
             # Tile-level pooled
             vector_name = "mean_pooling"
             query_pooled = query_np.mean(axis=0)
             query_vector = query_pooled.tolist()
             logger.debug(f"🔍 Tile-pooled search on '{vector_name}'")
         elif strategy == "pooled_global":
             # Global pooled vector (single vector)
             vector_name = "global_pooling"
             query_pooled = query_np.mean(axis=0)
             query_vector = query_pooled.tolist()
             logger.debug(f"🔍 Global-pooled search on '{vector_name}'")
         else:
             raise ValueError(f"Unknown strategy: {strategy}")
         results = self.client.query_points(
             collection_name=self.collection_name,
             query=query_vector,
@@ -105,7 +106,7 @@ class SingleStageRetriever:
             with_vectors=False,
             timeout=self.request_timeout,
         ).points
         return [
             {
                 "id": r.id,
@@ -115,7 +116,7 @@ class SingleStageRetriever:
             }
             for r in results
         ]
     def _to_numpy(self, embedding: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
         """Convert embedding to numpy array."""
         if isinstance(embedding, torch.Tensor):
@@ -123,5 +124,3 @@ class SingleStageRetriever:
                 return embedding.cpu().float().numpy()
             return embedding.cpu().numpy()
         return np.array(embedding, dtype=np.float32)

 """
 import logging
+from typing import Any, Dict, List, Union
 import numpy as np
 import torch
 class SingleStageRetriever:
     """
     Single-stage visual document retrieval using native Qdrant search.
     Supports strategies:
     - multi_vector: Native MaxSim on full embeddings (using="initial")
     - tiles_maxsim: Native MaxSim between query tokens and tile vectors (using="mean_pooling")
     - pooled_tile: Pooled query vs tile vectors (using="mean_pooling")
     - pooled_global: Pooled query vs global pooled doc vector (using="global_pooling")
     Args:
         qdrant_client: Connected Qdrant client
         collection_name: Name of the Qdrant collection
     Example:
         >>> retriever = SingleStageRetriever(client, "my_collection")
         >>> results = retriever.search(query, top_k=10)
     """
     def __init__(
         self,
         qdrant_client,
         self.client = qdrant_client
         self.collection_name = collection_name
         self.request_timeout = int(request_timeout)
     def search(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
     ) -> List[Dict[str, Any]]:
         """
         Single-stage search with configurable strategy.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Number of results
             strategy: "multi_vector", "tiles_maxsim", "pooled_tile", or "pooled_global"
             filter_obj: Qdrant filter
         Returns:
             List of results with scores and metadata
         """
         query_np = self._to_numpy(query_embedding)
         if strategy == "multi_vector":
             # Native multi-vector MaxSim
             vector_name = "initial"
             query_vector = query_np.tolist()
             logger.debug(f"🎯 Multi-vector search on '{vector_name}'")
         elif strategy == "tiles_maxsim":
             # Native multi-vector MaxSim against tile vectors
             vector_name = "mean_pooling"
             query_vector = query_np.tolist()
             logger.debug(f"🎯 Tile MaxSim search on '{vector_name}'")
         elif strategy == "pooled_tile":
             # Tile-level pooled
             vector_name = "mean_pooling"
             query_pooled = query_np.mean(axis=0)
             query_vector = query_pooled.tolist()
             logger.debug(f"🔍 Tile-pooled search on '{vector_name}'")
         elif strategy == "pooled_global":
             # Global pooled vector (single vector)
             vector_name = "global_pooling"
             query_pooled = query_np.mean(axis=0)
             query_vector = query_pooled.tolist()
             logger.debug(f"🔍 Global-pooled search on '{vector_name}'")
         else:
             raise ValueError(f"Unknown strategy: {strategy}")
         results = self.client.query_points(
             collection_name=self.collection_name,
             query=query_vector,
             with_vectors=False,
             timeout=self.request_timeout,
         ).points
         return [
             {
                 "id": r.id,
             }
             for r in results
         ]
     def _to_numpy(self, embedding: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
         """Convert embedding to numpy array."""
         if isinstance(embedding, torch.Tensor):
                 return embedding.cpu().float().numpy()
             return embedding.cpu().numpy()
         return np.array(embedding, dtype=np.float32)

visual_rag/retrieval/three_stage.py CHANGED Viewed

@@ -43,7 +43,7 @@ class ThreeStageRetriever:
                 last_err = e
                 if attempt >= self.max_retries - 1:
                     break
-                time.sleep(self.retry_sleep * (2 ** attempt))
         if last_err is not None:
             raise last_err
@@ -171,4 +171,3 @@ class ThreeStageRetriever:
                 }
             )
         return out

                 last_err = e
                 if attempt >= self.max_retries - 1:
                     break
+                time.sleep(self.retry_sleep * (2**attempt))
         if last_err is not None:
             raise last_err
                 }
             )
         return out

visual_rag/retrieval/two_stage.py CHANGED Viewed

@@ -17,47 +17,54 @@ Research Context:
 """
 import logging
-from typing import List, Dict, Any, Optional, Union
 import numpy as np
 import torch
 logger = logging.getLogger(__name__)
 class TwoStageRetriever:
     """
     Two-stage visual document retrieval with pooling and reranking.
     Stage 1 (Prefetch):
         Uses tile-level mean-pooled vectors for fast HNSW search.
         Retrieves prefetch_k candidates (e.g., 100-500).
     Stage 2 (Rerank):
         Fetches full multi-vector embeddings for candidates.
         Computes exact MaxSim scores for precise ranking.
         Returns top_k results (e.g., 10).
     Args:
         qdrant_client: Connected Qdrant client
         collection_name: Name of the Qdrant collection
         full_vector_name: Name of full multi-vector field (default: "initial")
         pooled_vector_name: Name of pooled vector field (default: "mean_pooling")
     Example:
         >>> retriever = TwoStageRetriever(client, "my_collection")
-        >>>
         >>> # Two-stage search: prefetch 200, return top 10
         >>> results = retriever.search(
         ...     query_embedding=query,
         ...     top_k=10,
         ...     prefetch_k=200,
         ... )
-        >>>
         >>> # Compare latency:
         >>> # Full MaxSim (1000 docs): ~500ms
         >>> # Two-stage (200→10):     ~50ms
     """
     def __init__(
         self,
         qdrant_client,
@@ -81,8 +88,6 @@ class TwoStageRetriever:
         self.retry_sleep = float(retry_sleep)
     def _retry_call(self, fn):
-        import time
         last_err = None
         for attempt in range(self.max_retries):
             try:
@@ -91,7 +96,7 @@ class TwoStageRetriever:
                 last_err = e
                 if attempt >= self.max_retries - 1:
                     break
-                time.sleep(self.retry_sleep * (2 ** attempt))
         if last_err is not None:
             raise last_err
@@ -105,27 +110,25 @@ class TwoStageRetriever:
     ) -> List[Dict[str, Any]]:
         """
         Two-stage retrieval using Qdrant's native prefetch (all server-side).
         This is MUCH faster than search() because it avoids network transfer
         of large multi-vector embeddings. All computation happens in Qdrant.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Final number of results
             prefetch_k: Candidates for stage 1 (default: max(100, 10 * top_k))
             filter_obj: Qdrant filter
             stage1_mode: How to do stage 1 prefetch
         Returns:
             List of results with scores
         """
-        from qdrant_client.http import models
         query_np = self._to_numpy(query_embedding)
         if prefetch_k is None:
             prefetch_k = max(100, top_k * 10)
         if stage1_mode == "pooled_query_vs_tiles":
             prefetch_query = query_np.mean(axis=0).tolist()
             prefetch_using = self.pooled_vector_name
@@ -143,9 +146,9 @@ class TwoStageRetriever:
             prefetch_using = self.global_vector_name
         else:
             raise ValueError(f"Unknown stage1_mode: {stage1_mode}")
         rerank_query = query_np.tolist()
         def _do_query():
             return self.client.query_points(
                 collection_name=self.collection_name,
@@ -154,9 +157,9 @@ class TwoStageRetriever:
                 limit=top_k,
                 query_filter=filter_obj,
                 with_payload=True,
-                search_params=models.SearchParams(exact=True),
                 prefetch=[
-                    models.Prefetch(
                         query=prefetch_query,
                         using=prefetch_using,
                         limit=prefetch_k,
@@ -164,9 +167,9 @@ class TwoStageRetriever:
                 ],
                 timeout=self.request_timeout,
             ).points
         results = self._retry_call(_do_query)
         return [
             {
                 "id": r.id,
@@ -177,7 +180,7 @@ class TwoStageRetriever:
             }
             for r in results
         ]
     def search(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
@@ -190,7 +193,7 @@ class TwoStageRetriever:
     ) -> List[Dict[str, Any]]:
         """
         Two-stage retrieval: prefetch with pooling, rerank with MaxSim.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Final number of results to return
@@ -202,7 +205,7 @@ class TwoStageRetriever:
                 - "pooled_query_vs_tiles": pool query to 1×dim and search tile vectors (using="mean_pooling")
                 - "tokens_vs_tiles": search tile vectors with full query tokens (using="mean_pooling")
                 - "pooled_query_vs_global": pool query to 1×dim and search global pooled doc vectors (using="global_pooling")
         Returns:
             List of results with scores and metadata:
             [
@@ -218,11 +221,11 @@ class TwoStageRetriever:
         """
         # Convert to numpy
         query_np = self._to_numpy(query_embedding)
         # Auto-set prefetch_k
         if prefetch_k is None:
             prefetch_k = max(100, top_k * 10)
         # Stage 1: Prefetch with pooled vectors
         logger.info(f"🔍 Stage 1: Prefetching {prefetch_k} candidates ({stage1_mode})")
         candidates = self._stage1_prefetch(
@@ -231,16 +234,16 @@ class TwoStageRetriever:
             filter_obj=filter_obj,
             stage1_mode=stage1_mode,
         )
         if not candidates:
             logger.warning("No candidates found in stage 1")
             return []
         logger.info(f"✅ Stage 1: Retrieved {len(candidates)} candidates")
         # Stage 2: Rerank with full embeddings
         if use_reranking and len(candidates) > top_k:
-            logger.info(f"🎯 Stage 2: Reranking with MaxSim...")
             results = self._stage2_rerank(
                 query_np=query_np,
                 candidates=candidates,
@@ -254,9 +257,9 @@ class TwoStageRetriever:
             for r in results:
                 r["score_final"] = r["score_stage1"]
             logger.info(f"⏭️ Skipping reranking, returning top {len(results)}")
         return results
     def search_single_stage(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
@@ -266,18 +269,18 @@ class TwoStageRetriever:
     ) -> List[Dict[str, Any]]:
         """
         Single-stage search (either pooled or full multi-vector).
         Args:
             query_embedding: Query embeddings
             top_k: Number of results
             filter_obj: Qdrant filter
             use_pooling: Use pooled vectors (faster) or full (more accurate)
         Returns:
             List of results
         """
         query_np = self._to_numpy(query_embedding)
         if use_pooling:
             # Pool query and search pooled vectors
             query_pooled = query_np.mean(axis=0)
@@ -289,7 +292,7 @@ class TwoStageRetriever:
             vector_name = self.full_vector_name
             query_vector = query_np.tolist()
             logger.info(f"🎯 Multi-vector search: {vector_name}")
         results = self.client.query_points(
             collection_name=self.collection_name,
             query=query_vector,
@@ -300,7 +303,7 @@ class TwoStageRetriever:
             with_vectors=False,
             timeout=120,
         ).points
         return [
             {
                 "id": r.id,
@@ -310,7 +313,7 @@ class TwoStageRetriever:
             }
             for r in results
         ]
     def _stage1_prefetch(
         self,
         query_np: np.ndarray,
@@ -330,7 +333,7 @@ class TwoStageRetriever:
             vector_name = self.global_vector_name
         else:
             raise ValueError(f"Unknown stage1_mode: {stage1_mode}")
         def _do_query():
             return self.client.query_points(
                 collection_name=self.collection_name,
@@ -344,7 +347,7 @@ class TwoStageRetriever:
             ).points
         results = self._retry_call(_do_query)
         return [
             {
                 "id": r.id,
@@ -353,7 +356,7 @@ class TwoStageRetriever:
             }
             for r in results
         ]
     def _stage2_rerank(
         self,
         query_np: np.ndarray,
@@ -362,11 +365,9 @@ class TwoStageRetriever:
         return_embeddings: bool = False,
     ) -> List[Dict[str, Any]]:
         """Stage 2: Rerank with full multi-vector MaxSim scoring."""
-        from visual_rag.embedding.pooling import compute_maxsim_score
         # Fetch full embeddings for candidates
         candidate_ids = [c["id"] for c in candidates]
         # Retrieve points with vectors
         def _do_retrieve():
             return self.client.retrieve(
@@ -378,7 +379,7 @@ class TwoStageRetriever:
             )
         points = self._retry_call(_do_retrieve)
         # Build ID to embedding map
         id_to_embedding = {}
         for point in points:
@@ -386,13 +387,13 @@ class TwoStageRetriever:
                 id_to_embedding[point.id] = np.array(
                     point.vector[self.full_vector_name], dtype=np.float32
                 )
         # Compute MaxSim scores
         reranked = []
         for candidate in candidates:
             point_id = candidate["id"]
             doc_embedding = id_to_embedding.get(point_id)
             if doc_embedding is None:
                 # Fallback to stage 1 score
                 candidate["score_stage2"] = candidate["score_stage1"]
@@ -402,17 +403,17 @@ class TwoStageRetriever:
                 maxsim_score = compute_maxsim_score(query_np, doc_embedding)
                 candidate["score_stage2"] = maxsim_score
                 candidate["score_final"] = maxsim_score
                 if return_embeddings:
                     candidate["embedding"] = doc_embedding
             reranked.append(candidate)
         # Sort by final score (descending)
         reranked.sort(key=lambda x: x["score_final"], reverse=True)
         return reranked[:top_k]
     def _to_numpy(self, embedding: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
         """Convert embedding to numpy array."""
         if isinstance(embedding, torch.Tensor):
@@ -420,7 +421,7 @@ class TwoStageRetriever:
                 return embedding.cpu().float().numpy()
             return embedding.cpu().numpy()
         return np.array(embedding, dtype=np.float32)
     def build_filter(
         self,
         year: Optional[Any] = None,
@@ -431,60 +432,38 @@ class TwoStageRetriever:
     ):
         """
         Build Qdrant filter from parameters.
         Supports single values or lists (using MatchAny).
         """
-        from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny
         conditions = []
         if year is not None:
             if isinstance(year, list):
                 year_values = [int(y) if isinstance(y, str) else y for y in year]
-                conditions.append(
-                    FieldCondition(key="year", match=MatchAny(any=year_values))
-                )
             else:
                 year_value = int(year) if isinstance(year, str) else year
-                conditions.append(
-                    FieldCondition(key="year", match=MatchValue(value=year_value))
-                )
         if source is not None:
             if isinstance(source, list):
-                conditions.append(
-                    FieldCondition(key="source", match=MatchAny(any=source))
-                )
             else:
-                conditions.append(
-                    FieldCondition(key="source", match=MatchValue(value=source))
-                )
         if district is not None:
             if isinstance(district, list):
-                conditions.append(
-                    FieldCondition(key="district", match=MatchAny(any=district))
-                )
             else:
-                conditions.append(
-                    FieldCondition(key="district", match=MatchValue(value=district))
-                )
         if filename is not None:
             if isinstance(filename, list):
-                conditions.append(
-                    FieldCondition(key="filename", match=MatchAny(any=filename))
-                )
             else:
-                conditions.append(
-                    FieldCondition(key="filename", match=MatchValue(value=filename))
-                )
-        if has_text is not None:
-            conditions.append(
-                FieldCondition(key="has_text", match=MatchValue(value=has_text))
-            )
-        return Filter(must=conditions) if conditions else None

 """
 import logging
+import time
+from typing import Any, Dict, List, Optional, Union
 import numpy as np
 import torch
+from qdrant_client.http import models as qdrant_models
+from qdrant_client.models import FieldCondition, Filter, MatchAny, MatchValue
+from visual_rag.embedding.pooling import compute_maxsim_score
 logger = logging.getLogger(__name__)
 class TwoStageRetriever:
     """
     Two-stage visual document retrieval with pooling and reranking.
     Stage 1 (Prefetch):
         Uses tile-level mean-pooled vectors for fast HNSW search.
         Retrieves prefetch_k candidates (e.g., 100-500).
     Stage 2 (Rerank):
         Fetches full multi-vector embeddings for candidates.
         Computes exact MaxSim scores for precise ranking.
         Returns top_k results (e.g., 10).
     Args:
         qdrant_client: Connected Qdrant client
         collection_name: Name of the Qdrant collection
         full_vector_name: Name of full multi-vector field (default: "initial")
         pooled_vector_name: Name of pooled vector field (default: "mean_pooling")
     Example:
         >>> retriever = TwoStageRetriever(client, "my_collection")
+        >>>
         >>> # Two-stage search: prefetch 200, return top 10
         >>> results = retriever.search(
         ...     query_embedding=query,
         ...     top_k=10,
         ...     prefetch_k=200,
         ... )
+        >>>
         >>> # Compare latency:
         >>> # Full MaxSim (1000 docs): ~500ms
         >>> # Two-stage (200→10):     ~50ms
     """
     def __init__(
         self,
         qdrant_client,
         self.retry_sleep = float(retry_sleep)
     def _retry_call(self, fn):
         last_err = None
         for attempt in range(self.max_retries):
             try:
                 last_err = e
                 if attempt >= self.max_retries - 1:
                     break
+                time.sleep(self.retry_sleep * (2**attempt))
         if last_err is not None:
             raise last_err
     ) -> List[Dict[str, Any]]:
         """
         Two-stage retrieval using Qdrant's native prefetch (all server-side).
         This is MUCH faster than search() because it avoids network transfer
         of large multi-vector embeddings. All computation happens in Qdrant.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Final number of results
             prefetch_k: Candidates for stage 1 (default: max(100, 10 * top_k))
             filter_obj: Qdrant filter
             stage1_mode: How to do stage 1 prefetch
         Returns:
             List of results with scores
         """
         query_np = self._to_numpy(query_embedding)
         if prefetch_k is None:
             prefetch_k = max(100, top_k * 10)
         if stage1_mode == "pooled_query_vs_tiles":
             prefetch_query = query_np.mean(axis=0).tolist()
             prefetch_using = self.pooled_vector_name
             prefetch_using = self.global_vector_name
         else:
             raise ValueError(f"Unknown stage1_mode: {stage1_mode}")
         rerank_query = query_np.tolist()
         def _do_query():
             return self.client.query_points(
                 collection_name=self.collection_name,
                 limit=top_k,
                 query_filter=filter_obj,
                 with_payload=True,
+                search_params=qdrant_models.SearchParams(exact=True),
                 prefetch=[
+                    qdrant_models.Prefetch(
                         query=prefetch_query,
                         using=prefetch_using,
                         limit=prefetch_k,
                 ],
                 timeout=self.request_timeout,
             ).points
         results = self._retry_call(_do_query)
         return [
             {
                 "id": r.id,
             }
             for r in results
         ]
     def search(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
     ) -> List[Dict[str, Any]]:
         """
         Two-stage retrieval: prefetch with pooling, rerank with MaxSim.
         Args:
             query_embedding: Query embeddings [num_tokens, dim]
             top_k: Final number of results to return
                 - "pooled_query_vs_tiles": pool query to 1×dim and search tile vectors (using="mean_pooling")
                 - "tokens_vs_tiles": search tile vectors with full query tokens (using="mean_pooling")
                 - "pooled_query_vs_global": pool query to 1×dim and search global pooled doc vectors (using="global_pooling")
         Returns:
             List of results with scores and metadata:
             [
         """
         # Convert to numpy
         query_np = self._to_numpy(query_embedding)
         # Auto-set prefetch_k
         if prefetch_k is None:
             prefetch_k = max(100, top_k * 10)
         # Stage 1: Prefetch with pooled vectors
         logger.info(f"🔍 Stage 1: Prefetching {prefetch_k} candidates ({stage1_mode})")
         candidates = self._stage1_prefetch(
             filter_obj=filter_obj,
             stage1_mode=stage1_mode,
         )
         if not candidates:
             logger.warning("No candidates found in stage 1")
             return []
         logger.info(f"✅ Stage 1: Retrieved {len(candidates)} candidates")
         # Stage 2: Rerank with full embeddings
         if use_reranking and len(candidates) > top_k:
+            logger.info("🎯 Stage 2: Reranking with MaxSim...")
             results = self._stage2_rerank(
                 query_np=query_np,
                 candidates=candidates,
             for r in results:
                 r["score_final"] = r["score_stage1"]
             logger.info(f"⏭️ Skipping reranking, returning top {len(results)}")
         return results
     def search_single_stage(
         self,
         query_embedding: Union[torch.Tensor, np.ndarray],
     ) -> List[Dict[str, Any]]:
         """
         Single-stage search (either pooled or full multi-vector).
         Args:
             query_embedding: Query embeddings
             top_k: Number of results
             filter_obj: Qdrant filter
             use_pooling: Use pooled vectors (faster) or full (more accurate)
         Returns:
             List of results
         """
         query_np = self._to_numpy(query_embedding)
         if use_pooling:
             # Pool query and search pooled vectors
             query_pooled = query_np.mean(axis=0)
             vector_name = self.full_vector_name
             query_vector = query_np.tolist()
             logger.info(f"🎯 Multi-vector search: {vector_name}")
         results = self.client.query_points(
             collection_name=self.collection_name,
             query=query_vector,
             with_vectors=False,
             timeout=120,
         ).points
         return [
             {
                 "id": r.id,
             }
             for r in results
         ]
     def _stage1_prefetch(
         self,
         query_np: np.ndarray,
             vector_name = self.global_vector_name
         else:
             raise ValueError(f"Unknown stage1_mode: {stage1_mode}")
         def _do_query():
             return self.client.query_points(
                 collection_name=self.collection_name,
             ).points
         results = self._retry_call(_do_query)
         return [
             {
                 "id": r.id,
             }
             for r in results
         ]
     def _stage2_rerank(
         self,
         query_np: np.ndarray,
         return_embeddings: bool = False,
     ) -> List[Dict[str, Any]]:
         """Stage 2: Rerank with full multi-vector MaxSim scoring."""
         # Fetch full embeddings for candidates
         candidate_ids = [c["id"] for c in candidates]
         # Retrieve points with vectors
         def _do_retrieve():
             return self.client.retrieve(
             )
         points = self._retry_call(_do_retrieve)
         # Build ID to embedding map
         id_to_embedding = {}
         for point in points:
                 id_to_embedding[point.id] = np.array(
                     point.vector[self.full_vector_name], dtype=np.float32
                 )
         # Compute MaxSim scores
         reranked = []
         for candidate in candidates:
             point_id = candidate["id"]
             doc_embedding = id_to_embedding.get(point_id)
             if doc_embedding is None:
                 # Fallback to stage 1 score
                 candidate["score_stage2"] = candidate["score_stage1"]
                 maxsim_score = compute_maxsim_score(query_np, doc_embedding)
                 candidate["score_stage2"] = maxsim_score
                 candidate["score_final"] = maxsim_score
                 if return_embeddings:
                     candidate["embedding"] = doc_embedding
             reranked.append(candidate)
         # Sort by final score (descending)
         reranked.sort(key=lambda x: x["score_final"], reverse=True)
         return reranked[:top_k]
     def _to_numpy(self, embedding: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
         """Convert embedding to numpy array."""
         if isinstance(embedding, torch.Tensor):
                 return embedding.cpu().float().numpy()
             return embedding.cpu().numpy()
         return np.array(embedding, dtype=np.float32)
     def build_filter(
         self,
         year: Optional[Any] = None,
     ):
         """
         Build Qdrant filter from parameters.
         Supports single values or lists (using MatchAny).
         """
         conditions = []
         if year is not None:
             if isinstance(year, list):
                 year_values = [int(y) if isinstance(y, str) else y for y in year]
+                conditions.append(FieldCondition(key="year", match=MatchAny(any=year_values)))
             else:
                 year_value = int(year) if isinstance(year, str) else year
+                conditions.append(FieldCondition(key="year", match=MatchValue(value=year_value)))
         if source is not None:
             if isinstance(source, list):
+                conditions.append(FieldCondition(key="source", match=MatchAny(any=source)))
             else:
+                conditions.append(FieldCondition(key="source", match=MatchValue(value=source)))
         if district is not None:
             if isinstance(district, list):
+                conditions.append(FieldCondition(key="district", match=MatchAny(any=district)))
             else:
+                conditions.append(FieldCondition(key="district", match=MatchValue(value=district)))
         if filename is not None:
             if isinstance(filename, list):
+                conditions.append(FieldCondition(key="filename", match=MatchAny(any=filename)))
             else:
+                conditions.append(FieldCondition(key="filename", match=MatchValue(value=filename)))
+        if has_text is not None:
+            conditions.append(FieldCondition(key="has_text", match=MatchValue(value=has_text)))
+        return Filter(must=conditions) if conditions else None

visual_rag/visualization/__init__.py CHANGED Viewed

@@ -7,8 +7,8 @@ This module provides:
 """
 from visual_rag.visualization.saliency import (
-    generate_saliency_map,
     create_saliency_overlay,
     visualize_search_results,
 )

 """
 from visual_rag.visualization.saliency import (
     create_saliency_overlay,
+    generate_saliency_map,
     visualize_search_results,
 )

visual_rag/visualization/saliency.py CHANGED Viewed

@@ -5,10 +5,11 @@ Generates attention/saliency maps to visualize which parts of documents
 are most relevant to a query.
 """
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-from typing import List, Dict, Any, Optional, Tuple, Union
 import logging
 logger = logging.getLogger(__name__)
@@ -24,9 +25,9 @@ def generate_saliency_map(
 ) -> Tuple[Image.Image, np.ndarray]:
     """
     Generate saliency map showing which parts of the image match the query.
     Computes patch-level relevance scores and overlays them on the image.
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embedding: Document visual embeddings [num_visual_tokens, dim]
@@ -35,10 +36,10 @@ def generate_saliency_map(
         colormap: Matplotlib colormap name (Reds, viridis, jet, etc.)
         alpha: Overlay transparency (0-1)
         threshold_percentile: Only highlight patches above this percentile
     Returns:
         Tuple of (annotated_image, patch_scores)
     Example:
         >>> query = embedder.embed_query("budget allocation")
         >>> doc = visual_embedding  # From embed_images
@@ -51,57 +52,57 @@ def generate_saliency_map(
         >>> annotated.save("saliency.png")
     """
     # Ensure numpy arrays
-    if hasattr(query_embedding, 'numpy'):
         query_np = query_embedding.numpy()
-    elif hasattr(query_embedding, 'cpu'):
         query_np = query_embedding.cpu().numpy()
     else:
         query_np = np.array(query_embedding, dtype=np.float32)
-    if hasattr(doc_embedding, 'numpy'):
         doc_np = doc_embedding.numpy()
-    elif hasattr(doc_embedding, 'cpu'):
         doc_np = doc_embedding.cpu().numpy()
     else:
         doc_np = np.array(doc_embedding, dtype=np.float32)
     # Normalize embeddings
     query_norm = query_np / (np.linalg.norm(query_np, axis=1, keepdims=True) + 1e-8)
     doc_norm = doc_np / (np.linalg.norm(doc_np, axis=1, keepdims=True) + 1e-8)
     # Compute similarity matrix: [num_query, num_doc]
     similarity_matrix = np.dot(query_norm, doc_norm.T)
     # Get max similarity per document patch (best match from any query token)
     patch_scores = similarity_matrix.max(axis=0)
     # Normalize to [0, 1]
     score_min, score_max = patch_scores.min(), patch_scores.max()
     if score_max - score_min > 1e-8:
         patch_scores_norm = (patch_scores - score_min) / (score_max - score_min)
     else:
         patch_scores_norm = np.zeros_like(patch_scores)
     # Determine grid dimensions
     if token_info and token_info.get("n_rows") and token_info.get("n_cols"):
         n_rows = token_info["n_rows"]
         n_cols = token_info["n_cols"]
         num_tiles = n_rows * n_cols + 1  # +1 for global tile
         patches_per_tile = 64  # ColSmol standard
         # Reshape to tile grid (excluding global tile)
         try:
             # Skip global tile patches at the end
             tile_patches = num_tiles * patches_per_tile
             if len(patch_scores_norm) >= tile_patches:
-                grid_patches = patch_scores_norm[:n_rows * n_cols * patches_per_tile]
             else:
                 grid_patches = patch_scores_norm
             # Reshape: [tiles * patches_per_tile] -> [tiles, patches_per_tile]
             # Then mean per tile
             num_grid_tiles = n_rows * n_cols
-            grid_patches = grid_patches[:num_grid_tiles * patches_per_tile]
             tile_scores = grid_patches.reshape(num_grid_tiles, patches_per_tile).mean(axis=1)
             tile_scores = tile_scores.reshape(n_rows, n_cols)
         except Exception as e:
@@ -110,7 +111,7 @@ def generate_saliency_map(
     else:
         tile_scores = None
         n_rows = n_cols = None
     # Create overlay
     annotated = create_saliency_overlay(
         image=image,
@@ -121,7 +122,7 @@ def generate_saliency_map(
         grid_rows=n_rows,
         grid_cols=n_cols,
     )
     return annotated, patch_scores
@@ -136,7 +137,7 @@ def create_saliency_overlay(
 ) -> Image.Image:
     """
     Create colored overlay on image based on scores.
     Args:
         image: Base PIL Image
         scores: Score array - 1D [num_patches] or 2D [rows, cols]
@@ -144,7 +145,7 @@ def create_saliency_overlay(
         alpha: Overlay transparency
         threshold_percentile: Only color patches above this percentile
         grid_rows, grid_cols: Grid dimensions (auto-detected if not provided)
     Returns:
         Annotated PIL Image
     """
@@ -153,10 +154,10 @@ def create_saliency_overlay(
     except ImportError:
         logger.warning("matplotlib not installed, returning original image")
         return image
     img_array = np.array(image)
     h, w = img_array.shape[:2]
     # Handle 2D scores (tile grid)
     if scores.ndim == 2:
         rows, cols = scores.shape
@@ -171,58 +172,58 @@ def create_saliency_overlay(
             aspect = w / h
             cols = int(np.sqrt(num_patches * aspect))
             rows = max(1, num_patches // cols)
-            scores = scores[:rows * cols].reshape(rows, cols)
     else:
         # Auto-estimate grid
         num_patches = len(scores) if scores.ndim == 1 else scores.size
         aspect = w / h
         cols = max(1, int(np.sqrt(num_patches * aspect)))
         rows = max(1, num_patches // cols)
         if rows * cols > len(scores) if scores.ndim == 1 else scores.size:
             cols = max(1, cols - 1)
         if scores.ndim == 1:
-            scores = scores[:rows * cols].reshape(rows, cols)
     # Get colormap
     cmap = plt.cm.get_cmap(colormap)
     # Calculate threshold
     threshold = np.percentile(scores, threshold_percentile)
     # Calculate cell dimensions
     cell_h = h // rows
     cell_w = w // cols
     # Create RGBA overlay
     overlay = np.zeros((h, w, 4), dtype=np.uint8)
     for i in range(rows):
         for j in range(cols):
             score = scores[i, j]
             if score >= threshold:
                 y1 = i * cell_h
                 y2 = min((i + 1) * cell_h, h)
                 x1 = j * cell_w
                 x2 = min((j + 1) * cell_w, w)
                 # Normalize score for coloring (above threshold)
                 norm_score = (score - threshold) / (1.0 - threshold + 1e-8)
                 norm_score = min(1.0, max(0.0, norm_score))
                 # Get color
                 color = cmap(norm_score)[:3]
                 color_uint8 = (np.array(color) * 255).astype(np.uint8)
                 overlay[y1:y2, x1:x2, :3] = color_uint8
                 overlay[y1:y2, x1:x2, 3] = int(alpha * 255 * norm_score)
     # Blend with original
     overlay_img = Image.fromarray(overlay, "RGBA")
     result = Image.alpha_composite(image.convert("RGBA"), overlay_img)
     return result.convert("RGB")
@@ -237,7 +238,7 @@ def visualize_search_results(
 ) -> Optional[Image.Image]:
     """
     Visualize search results as a grid of images with scores.
     Args:
         query: Original query text
         results: List of search results with 'payload' containing 'page' (image URL/base64)
@@ -246,7 +247,7 @@ def visualize_search_results(
         output_path: Path to save visualization (optional)
         max_results: Maximum results to show
         show_saliency: Generate saliency overlays (requires query_embedding & embeddings)
     Returns:
         Combined visualization image if successful
     """
@@ -255,32 +256,32 @@ def visualize_search_results(
     except ImportError:
         logger.error("matplotlib required for visualization")
         return None
     results = results[:max_results]
     n = len(results)
     if n == 0:
         logger.warning("No results to visualize")
         return None
     fig, axes = plt.subplots(1, n, figsize=(4 * n, 4))
     if n == 1:
         axes = [axes]
     for idx, (result, ax) in enumerate(zip(results, axes)):
         payload = result.get("payload", {})
         score = result.get("score_final", result.get("score_stage1", 0))
         # Try to load image from payload
         page_data = payload.get("page", "")
         image = None
         if page_data.startswith("data:image"):
             # Base64 encoded
             try:
                 import base64
                 from io import BytesIO
                 b64_data = page_data.split(",")[1]
                 image = Image.open(BytesIO(base64.b64decode(b64_data)))
             except Exception as e:
@@ -290,50 +291,45 @@ def visualize_search_results(
             try:
                 import urllib.request
                 from io import BytesIO
                 with urllib.request.urlopen(page_data, timeout=5) as response:
                     image = Image.open(BytesIO(response.read()))
             except Exception as e:
                 logger.debug(f"Could not fetch image URL: {e}")
         if image:
             ax.imshow(image)
         else:
             # Show placeholder
-            ax.text(
-                0.5, 0.5, "No image",
-                ha="center", va="center",
-                fontsize=12, color="gray"
-            )
         # Add title
         title = f"Rank {idx + 1}\nScore: {score:.3f}"
         if payload.get("filename"):
             title += f"\n{payload['filename'][:30]}"
         if payload.get("page_number") is not None:
             title += f" p.{payload['page_number'] + 1}"
         ax.set_title(title, fontsize=9)
         ax.axis("off")
     # Add query as suptitle
     query_display = query[:80] + "..." if len(query) > 80 else query
     plt.suptitle(f"Query: {query_display}", fontsize=11, fontweight="bold")
     plt.tight_layout()
     if output_path:
         plt.savefig(output_path, dpi=150, bbox_inches="tight")
         logger.info(f"💾 Saved visualization to: {output_path}")
     # Convert to PIL Image for return
     from io import BytesIO
     buf = BytesIO()
     plt.savefig(buf, format="png", dpi=100, bbox_inches="tight")
     buf.seek(0)
     result_image = Image.open(buf)
-    plt.close()
-    return result_image

 are most relevant to a query.
 """
 import logging
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from PIL import Image
 logger = logging.getLogger(__name__)
 ) -> Tuple[Image.Image, np.ndarray]:
     """
     Generate saliency map showing which parts of the image match the query.
     Computes patch-level relevance scores and overlays them on the image.
     Args:
         query_embedding: Query embeddings [num_query_tokens, dim]
         doc_embedding: Document visual embeddings [num_visual_tokens, dim]
         colormap: Matplotlib colormap name (Reds, viridis, jet, etc.)
         alpha: Overlay transparency (0-1)
         threshold_percentile: Only highlight patches above this percentile
     Returns:
         Tuple of (annotated_image, patch_scores)
     Example:
         >>> query = embedder.embed_query("budget allocation")
         >>> doc = visual_embedding  # From embed_images
         >>> annotated.save("saliency.png")
     """
     # Ensure numpy arrays
+    if hasattr(query_embedding, "numpy"):
         query_np = query_embedding.numpy()
+    elif hasattr(query_embedding, "cpu"):
         query_np = query_embedding.cpu().numpy()
     else:
         query_np = np.array(query_embedding, dtype=np.float32)
+    if hasattr(doc_embedding, "numpy"):
         doc_np = doc_embedding.numpy()
+    elif hasattr(doc_embedding, "cpu"):
         doc_np = doc_embedding.cpu().numpy()
     else:
         doc_np = np.array(doc_embedding, dtype=np.float32)
     # Normalize embeddings
     query_norm = query_np / (np.linalg.norm(query_np, axis=1, keepdims=True) + 1e-8)
     doc_norm = doc_np / (np.linalg.norm(doc_np, axis=1, keepdims=True) + 1e-8)
     # Compute similarity matrix: [num_query, num_doc]
     similarity_matrix = np.dot(query_norm, doc_norm.T)
     # Get max similarity per document patch (best match from any query token)
     patch_scores = similarity_matrix.max(axis=0)
     # Normalize to [0, 1]
     score_min, score_max = patch_scores.min(), patch_scores.max()
     if score_max - score_min > 1e-8:
         patch_scores_norm = (patch_scores - score_min) / (score_max - score_min)
     else:
         patch_scores_norm = np.zeros_like(patch_scores)
     # Determine grid dimensions
     if token_info and token_info.get("n_rows") and token_info.get("n_cols"):
         n_rows = token_info["n_rows"]
         n_cols = token_info["n_cols"]
         num_tiles = n_rows * n_cols + 1  # +1 for global tile
         patches_per_tile = 64  # ColSmol standard
         # Reshape to tile grid (excluding global tile)
         try:
             # Skip global tile patches at the end
             tile_patches = num_tiles * patches_per_tile
             if len(patch_scores_norm) >= tile_patches:
+                grid_patches = patch_scores_norm[: n_rows * n_cols * patches_per_tile]
             else:
                 grid_patches = patch_scores_norm
             # Reshape: [tiles * patches_per_tile] -> [tiles, patches_per_tile]
             # Then mean per tile
             num_grid_tiles = n_rows * n_cols
+            grid_patches = grid_patches[: num_grid_tiles * patches_per_tile]
             tile_scores = grid_patches.reshape(num_grid_tiles, patches_per_tile).mean(axis=1)
             tile_scores = tile_scores.reshape(n_rows, n_cols)
         except Exception as e:
     else:
         tile_scores = None
         n_rows = n_cols = None
     # Create overlay
     annotated = create_saliency_overlay(
         image=image,
         grid_rows=n_rows,
         grid_cols=n_cols,
     )
     return annotated, patch_scores
 ) -> Image.Image:
     """
     Create colored overlay on image based on scores.
     Args:
         image: Base PIL Image
         scores: Score array - 1D [num_patches] or 2D [rows, cols]
         alpha: Overlay transparency
         threshold_percentile: Only color patches above this percentile
         grid_rows, grid_cols: Grid dimensions (auto-detected if not provided)
     Returns:
         Annotated PIL Image
     """
     except ImportError:
         logger.warning("matplotlib not installed, returning original image")
         return image
     img_array = np.array(image)
     h, w = img_array.shape[:2]
     # Handle 2D scores (tile grid)
     if scores.ndim == 2:
         rows, cols = scores.shape
             aspect = w / h
             cols = int(np.sqrt(num_patches * aspect))
             rows = max(1, num_patches // cols)
+            scores = scores[: rows * cols].reshape(rows, cols)
     else:
         # Auto-estimate grid
         num_patches = len(scores) if scores.ndim == 1 else scores.size
         aspect = w / h
         cols = max(1, int(np.sqrt(num_patches * aspect)))
         rows = max(1, num_patches // cols)
         if rows * cols > len(scores) if scores.ndim == 1 else scores.size:
             cols = max(1, cols - 1)
         if scores.ndim == 1:
+            scores = scores[: rows * cols].reshape(rows, cols)
     # Get colormap
     cmap = plt.cm.get_cmap(colormap)
     # Calculate threshold
     threshold = np.percentile(scores, threshold_percentile)
     # Calculate cell dimensions
     cell_h = h // rows
     cell_w = w // cols
     # Create RGBA overlay
     overlay = np.zeros((h, w, 4), dtype=np.uint8)
     for i in range(rows):
         for j in range(cols):
             score = scores[i, j]
             if score >= threshold:
                 y1 = i * cell_h
                 y2 = min((i + 1) * cell_h, h)
                 x1 = j * cell_w
                 x2 = min((j + 1) * cell_w, w)
                 # Normalize score for coloring (above threshold)
                 norm_score = (score - threshold) / (1.0 - threshold + 1e-8)
                 norm_score = min(1.0, max(0.0, norm_score))
                 # Get color
                 color = cmap(norm_score)[:3]
                 color_uint8 = (np.array(color) * 255).astype(np.uint8)
                 overlay[y1:y2, x1:x2, :3] = color_uint8
                 overlay[y1:y2, x1:x2, 3] = int(alpha * 255 * norm_score)
     # Blend with original
     overlay_img = Image.fromarray(overlay, "RGBA")
     result = Image.alpha_composite(image.convert("RGBA"), overlay_img)
     return result.convert("RGB")
 ) -> Optional[Image.Image]:
     """
     Visualize search results as a grid of images with scores.
     Args:
         query: Original query text
         results: List of search results with 'payload' containing 'page' (image URL/base64)
         output_path: Path to save visualization (optional)
         max_results: Maximum results to show
         show_saliency: Generate saliency overlays (requires query_embedding & embeddings)
     Returns:
         Combined visualization image if successful
     """
     except ImportError:
         logger.error("matplotlib required for visualization")
         return None
     results = results[:max_results]
     n = len(results)
     if n == 0:
         logger.warning("No results to visualize")
         return None
     fig, axes = plt.subplots(1, n, figsize=(4 * n, 4))
     if n == 1:
         axes = [axes]
     for idx, (result, ax) in enumerate(zip(results, axes)):
         payload = result.get("payload", {})
         score = result.get("score_final", result.get("score_stage1", 0))
         # Try to load image from payload
         page_data = payload.get("page", "")
         image = None
         if page_data.startswith("data:image"):
             # Base64 encoded
             try:
                 import base64
                 from io import BytesIO
                 b64_data = page_data.split(",")[1]
                 image = Image.open(BytesIO(base64.b64decode(b64_data)))
             except Exception as e:
             try:
                 import urllib.request
                 from io import BytesIO
                 with urllib.request.urlopen(page_data, timeout=5) as response:
                     image = Image.open(BytesIO(response.read()))
             except Exception as e:
                 logger.debug(f"Could not fetch image URL: {e}")
         if image:
             ax.imshow(image)
         else:
             # Show placeholder
+            ax.text(0.5, 0.5, "No image", ha="center", va="center", fontsize=12, color="gray")
         # Add title
         title = f"Rank {idx + 1}\nScore: {score:.3f}"
         if payload.get("filename"):
             title += f"\n{payload['filename'][:30]}"
         if payload.get("page_number") is not None:
             title += f" p.{payload['page_number'] + 1}"
         ax.set_title(title, fontsize=9)
         ax.axis("off")
     # Add query as suptitle
     query_display = query[:80] + "..." if len(query) > 80 else query
     plt.suptitle(f"Query: {query_display}", fontsize=11, fontweight="bold")
     plt.tight_layout()
     if output_path:
         plt.savefig(output_path, dpi=150, bbox_inches="tight")
         logger.info(f"💾 Saved visualization to: {output_path}")
     # Convert to PIL Image for return
     from io import BytesIO
     buf = BytesIO()
     plt.savefig(buf, format="png", dpi=100, bbox_inches="tight")
     buf.seek(0)
     result_image = Image.open(buf)
+    plt.close()
+    return result_image