File size: 1,347 Bytes
531b3e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from __future__ import annotations

import pickle
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

INDEX_FILE = "xkcd.index"
META_FILE = "meta.pkl"

# --- Build / load index ---
def build_index() -> tuple[faiss.IndexFlatL2, list[dict[str, object]]]:
    """Build a FAISS index over the xkcd dataset and persist it to disk.

    Loads the ``train`` split of ``olivierdehaene/xkcd``, embeds one text per
    comic (title, transcript and explanation joined by single spaces) with
    the ``all-MiniLM-L6-v2`` sentence-transformer model, and adds the vectors
    to a flat L2 index. The index is written to ``INDEX_FILE`` and the
    per-comic metadata is pickled to ``META_FILE``.

    Returns:
        An ``(index, meta)`` pair. ``meta`` is a list of plain dicts with the
        keys ``id``, ``title``, ``transcript`` and ``explanation``, built in
        the same order as the vectors added to ``index``.
    """
    print("Building FAISS index...")
    ds = load_dataset("olivierdehaene/xkcd", split="train")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Walk the dataset once, collecting the text to embed and the metadata
    # record for each comic together so the two lists stay aligned.
    texts = []
    meta = []
    for ex in ds:
        explanation = ex.get("explanation", "")
        # Falsy fields (None / empty) become "" in the embedded text only;
        # the metadata keeps the raw dataset values.
        title = ex["title"] or ""
        transcript = ex["transcript"] or ""
        texts.append(f"{title} {transcript} {explanation or ''}")
        # Store just the metadata we need (pickle-friendly)
        meta.append(
            {
                "id": ex["id"],
                "title": ex["title"],
                "transcript": ex["transcript"],
                "explanation": explanation,
            }
        )

    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    # The index dimensionality is taken from the embedding width.
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "wb") as f:
        pickle.dump(meta, f)

    return index, meta

# Build and persist the index whenever this file is executed or imported
# (there is no ``__main__`` guard); the returned (index, meta) pair is discarded.
build_index()