File size: 3,496 Bytes
bb04c5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# evaluation/indexer_bridge.py

import numpy as np
from indexer.chunker import Chunker
from indexer.embedder import Embedder
from indexer.store import Store


class IndexerBridge:
    """Feed a BEIR-style corpus straight into the existing indexing pipeline.

    The corpus documents are NOT real files on disk — they come from JSONL —
    so the Crawler/Extractor stages are bypassed and text is injected directly
    into Chunker → Embedder → Store.

    Each document gets a fake filepath: "{dataset_name}://{doc_id}".
    This lets the Store treat them like any other indexed file, and the
    Evaluator can later match doc_id back from results.
    """

    def __init__(
        self,
        config_path: str = "config.yaml",
        chunk_size: int = 500,
        overlap: int = 50,
    ):
        """Build the three pipeline stages.

        Args:
            config_path: configuration file handed to both Embedder and Store.
            chunk_size:  forwarded to ``Chunker(chunk_size=...)``; defaults to
                         the previously hard-coded 500.
            overlap:     forwarded to ``Chunker(overlap=...)``; defaults to
                         the previously hard-coded 50.
        """
        self.chunker = Chunker(chunk_size=chunk_size, overlap=overlap)
        self.embedder = Embedder(config_path)
        self.store = Store(config_path)

    def index_corpus(self, corpus: dict, batch_size: int = 64, dataset_name: str = "dataset"):
        """Index the entire corpus through Chunker → Embedder → Store.

        Entries previously stored under this dataset's path prefix are removed
        first, so re-running the same dataset replaces its documents.

        Args:
            corpus:       {doc_id: {"title": str, "text": str}}. A missing or
                          None title/text is treated as an empty string;
                          documents with no text at all are skipped.
            batch_size:   maximum number of chunks handed to the embedder in a
                          single call (memory control). Values below 1 are
                          treated as 1.
            dataset_name: prefix for the fake file paths,
                          e.g. "scifact", "nfcorpus".
        """
        # A non-positive size would never drain the buffer below; clamp it.
        batch_size = max(1, batch_size)
        total = len(corpus)
        print(f"Indexing {total} documents from [{dataset_name}]...")

        prefix = f"{dataset_name}://"
        self._clear_dataset(prefix, dataset_name)

        pending = []  # chunks waiting to be embedded, in document order
        for i, (doc_id, doc) in enumerate(corpus.items(), 1):
            title = doc.get("title") or ""
            body = doc.get("text") or ""
            full_text = f"{title} {body}".strip()

            if full_text:
                fake_path = f"{prefix}{doc_id}"
                chunks = self.chunker.chunk_file(full_text, fake_path)
                # Register the document before any of its chunks are stored.
                self.store.save_file_info(fake_path, doc_id, len(chunks))
                pending.extend(chunks)

                # Drain in slices so a single long document can never push
                # more than batch_size chunks into one embed call.
                while len(pending) >= batch_size:
                    self._embed_and_store(pending[:batch_size])
                    del pending[:batch_size]

            # Report progress even when the current document was skipped.
            if i % 500 == 0:
                print(f"  Indexed {i}/{total}...")

        # Flush whatever is left over (fewer than batch_size chunks).
        if pending:
            self._embed_and_store(pending)
        print(f"Done. Total vectors: {self.store.get_total_vectors()}")

    def _clear_dataset(self, prefix: str, dataset_name: str) -> None:
        """Remove stored entries whose path starts with *prefix* (this dataset only)."""
        stale = [fp for fp in self.store.load_hashes() if fp.startswith(prefix)]
        for fp in stale:
            self.store.remove_file_chunks(fp)
        if stale:
            print(f"Cleared {len(stale)} previously indexed [{dataset_name}] documents")

    def _embed_and_store(self, chunks: list) -> None:
        """Embed the text of *chunks* and add chunks plus float32 vectors to the store."""
        texts = [chunk["text"] for chunk in chunks]
        embeddings = np.array(self.embedder.embed_chunks(texts), dtype="float32")
        self.store.add_chunks(chunks, embeddings)


if __name__ == "__main__":
    # Script entry point: load the SciFact corpus from data/scifact and push
    # it through the bridge. The loader import stays local so that importing
    # this module does not pull in the dataset loader.
    from evaluation.dataset_loader import DatasetLoader

    scifact_corpus = DatasetLoader("data/scifact").load_corpus()
    IndexerBridge().index_corpus(
        scifact_corpus,
        batch_size=64,
        dataset_name="scifact",
    )