# Spaces: Subhadip007 / researchpilot-api (Running) - File size: 6,679 Bytes

"""
Loads embeddings + chunks from disk and indexes them into Qdrant.

This is a ONE-TIME operation (or run when new papers are added).
After this, all searches go through Qdrant - not numpy arrays.
"""

import json
import numpy as np
from pathlib import Path

from src.vectorstore.qdrant_store import QdrantStore
from src.embeddings.embedding_cache import EmbeddingCache
from src.utils.logger import get_logger
from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR

logger = get_logger(__name__)



class VectorIndexer:
    """Orchestrates loading embeddings and indexing them into Qdrant.

    Chunk texts and metadata are read from the ``*_semantic.json`` files in
    ``CHUNKS_DIR``; vectors come from ``EmbeddingCache``. The two sources are
    aligned by ``chunk_id`` before being handed to ``QdrantStore``.
    """

    def __init__(self):
        self.store = QdrantStore()
        self.cache = EmbeddingCache()

    def load_chunk_from_disk(self) -> tuple[list[str], list[str], list[dict]]:
        """
        Load chunk texts and metadata directly from chunk files.
        This is the ground truth source - chunk files have everything.

        Two on-disk layouts are accepted:
            Old local format:  [{chunk_id: ..., text: ...}, ...]
            New Kaggle format: {"paper_id": "...", "chunks": [...]}
        Files with any other top-level shape are skipped with a warning.

        Returns:
            chunk_ids: list of chunk ID strings
            texts:     list of chunk text strings
            metadata:  list of metadata dicts (every chunk key except "text")

        Raises:
            KeyError: if a chunk entry lacks "chunk_id" or "text".
        """
        chunk_ids: list[str] = []
        texts: list[str] = []
        metadata: list[dict] = []

        # Sort so the resulting chunk order is reproducible between runs;
        # glob() alone yields files in directory-listing order.
        chunk_files = sorted(CHUNKS_DIR.glob("*_semantic.json"))
        logger.info(f"Loading chunks from {len(chunk_files)} files...")

        for cf in chunk_files:
            with open(cf, "r", encoding="utf-8") as f:
                raw = json.load(f)

            if isinstance(raw, dict) and "chunks" in raw:
                chunk_list = raw["chunks"]
            elif isinstance(raw, list):
                chunk_list = raw
            else:
                logger.warning(f"Unexpected format in {cf.name}, skipping")
                continue

            for chunk in chunk_list:
                chunk_ids.append(chunk["chunk_id"])
                texts.append(chunk["text"])
                # Everything except text goes into metadata
                metadata.append({k: v for k, v in chunk.items() if k != "text"})

        logger.info(f"Loaded {len(chunk_ids):,} chunks from disk")
        return chunk_ids, texts, metadata

    def run(self, recreate: bool = False) -> dict:
        """
        Full indexing pipeline.

        All data (chunks and cached embeddings) is loaded and aligned BEFORE
        the collection is created/recreated, so a failure while loading - or a
        run in which nothing can be indexed - never leaves the store modified.

        Args:
            recreate: Delete existing collection and re-index everything.
                      Set True when you change embedding model or chunking.

        Returns:
            Indexing statistics. ``status`` is one of:
                "already_indexed"       - collection is non-empty and
                                          recreate is False ("points" holds
                                          the current size)
                "no_embeddings_matched" - no chunk on disk has a cached
                                          embedding; the store is untouched
                                          ("chunks_indexed" is 0)
                "complete"              - chunks were indexed
                                          ("chunks_indexed", "collection_info")
        """
        # Nothing to do if the collection is already populated.
        current_size = self.store.get_collection_size()
        if current_size > 0 and not recreate:
            logger.info(
                f"Collection already has {current_size:,} points. "
                f"Run with recreate=True to re-index."
            )
            return {
                "status": "already_indexed",
                "points": current_size,
            }

        # Step 1: Load directly from chunk files - ground truth source
        # (chunk files have text + metadata).
        chunk_ids, texts, metadata = self.load_chunk_from_disk()

        # Step 2: Load embeddings from cache.
        logger.info("Loading embeddings from cache...")
        self.cache.load()
        embeddings_matrix, cached_ids = self.cache.get_all()

        # Step 3: Align by chunk_id. Cache order may differ from disk order,
        # and some chunks may have been added after the last embedding run,
        # so keep only (disk position, cache row) pairs present in both.
        id_to_row = {cid: i for i, cid in enumerate(cached_ids)}
        matched = [
            (pos, id_to_row[cid])
            for pos, cid in enumerate(chunk_ids)
            if cid in id_to_row
        ]

        missing = len(chunk_ids) - len(matched)
        if missing:
            logger.warning(
                f"{missing:,} chunks have no cached embedding and will be skipped"
            )

        if not matched:
            # Bail out before touching the store: recreating the collection
            # here would wipe existing data with nothing to replace it.
            logger.error("No chunks matched a cached embedding; nothing to index")
            return {
                "status": "no_embeddings_matched",
                "chunks_indexed": 0,
            }

        # Gather the matching rows in disk order with one indexing operation.
        rows = np.asarray([row for _, row in matched], dtype=np.intp)
        ordered_embeddings = np.asarray(embeddings_matrix)[rows]

        chunk_ids = [chunk_ids[pos] for pos, _ in matched]
        texts = [texts[pos] for pos, _ in matched]
        metadata = [metadata[pos] for pos, _ in matched]

        logger.info(f"Matched {len(chunk_ids):,} chunks with embeddings")

        # Step 4: Create (or recreate) the collection only now that the data
        # to put into it is known to be available.
        self.store.create_collection(recreate=recreate)

        # Step 5: Index everything into Qdrant
        logger.info(f"Indexing {len(chunk_ids):,} chunks into Qdrant...")
        total = self.store.index_chunks(
            embeddings=ordered_embeddings,
            chunk_ids=chunk_ids,
            metadata=metadata,
            texts=texts,
        )

        stats = {
            "status": "complete",
            "chunks_indexed": total,
            "collection_info": self.store.get_collection_info(),
        }

        logger.info(f"Indexing completed: {stats}")
        return stats