Spaces:
Runtime error
Runtime error
File size: 1,347 Bytes
531b3e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
from __future__ import annotations
import pickle
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
INDEX_FILE = "xkcd.index"
META_FILE = "meta.pkl"
# --- Build / load index ---
def build_index():
    """Embed the xkcd dataset and persist a FAISS index plus its metadata.

    Loads the ``train`` split of ``olivierdehaene/xkcd``, builds one text per
    row (title, transcript and explanation separated by single spaces),
    encodes the texts with the ``all-MiniLM-L6-v2`` sentence-transformer and
    adds the vectors to a flat L2 FAISS index. The index is written to
    ``INDEX_FILE`` and the per-row metadata is pickled to ``META_FILE``.

    Returns:
        A ``(index, meta)`` tuple: the populated FAISS index and the list of
        metadata dicts. Both are produced by iterating the dataset, so entry
        ``i`` of ``meta`` is expected to describe vector ``i`` of the index.
    """
    print("Building FAISS index...")
    comics = load_dataset("olivierdehaene/xkcd", split="train")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    # Falsy fields (None, "") and a missing "explanation" key become "" so
    # the embedded text never contains the literal string "None".
    corpus = [
        f"{row['title'] or ''} {row['transcript'] or ''} {row.get('explanation') or ''}"
        for row in comics
    ]

    vectors = encoder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)

    # The index dimensionality is taken from the encoder output.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    faiss.write_index(flat_index, INDEX_FILE)

    # Store just the metadata we need (pickle-friendly). Values are kept raw
    # here; only a missing "explanation" key is replaced by "".
    records = []
    for row in comics:
        records.append(
            {
                "id": row["id"],
                "title": row["title"],
                "transcript": row["transcript"],
                "explanation": row.get("explanation", ""),
            }
        )

    with open(META_FILE, "wb") as sink:
        pickle.dump(records, sink)

    return flat_index, records
# Build and persist the index whenever this file is executed. The call is
# intentionally left unguarded so running or importing the file behaves as
# before.
build_index()