gdscript-assistant / colab_build_index.py
vivekchakraverty's picture
Fix Colab OOM: cap seq length + smaller batch
c314e63 verified
"""Build the jina FAISS index on a free Colab/Kaggle GPU and push it to the Space.
Run this in a GPU Colab notebook (Runtime -> Change runtime type -> T4 GPU).
It pulls chunks.jsonl from your Space repo, embeds all chunks with
jina-embeddings-v2-base-code on the GPU (~minutes), builds the FAISS index in the
exact format rag.py expects (cosine / IndexIDMap2, faiss_id == chunk id), and
uploads embeddings.faiss + id_map.json back to the Space — so the ~280 MB index
never touches your local machine.
USAGE (paste into a Colab cell, or upload this file and `%run` it):
1) Set SPACE_REPO and HF_TOKEN below (token: https://huggingface.co/settings/tokens, write).
2) Run. When it finishes, the Space restarts with full RAG.
Cell 0 (install):
!pip install -q "transformers<5" sentence-transformers einops faiss-cpu huggingface_hub
"""
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import json
import faiss
import numpy as np
from huggingface_hub import hf_hub_download, login, upload_file
from sentence_transformers import SentenceTransformer
# ─── CONFIG ────────────────────────────────────────────────────────────────
# Target Space repo id; taken from the SPACE_REPO env var, else edit the fallback.
SPACE_REPO = os.getenv("SPACE_REPO", "<user>/gdscript-assistant")  # <-- set
# Hugging Face access token (write scope); taken from the HF_TOKEN env var.
HF_TOKEN = os.getenv("HF_TOKEN", "")  # <-- set (write)
# Hub id of the embedding model used for every chunk.
MODEL = "jinaai/jina-embeddings-v2-base-code"
# Encode batch size: kept small so long chunks do not OOM a T4.
BATCH = 32
# Per-chunk token cap; chunks are mostly tiny (p90 ~242 tokens), so the cap
# mainly bounds worst-case memory.
MAX_LEN = 1024
# ───────────────────────────────────────────────────────────────────────────
# Pre-flight check: the config ships with a placeholder repo id and an empty
# token, so report a forgotten edit here with an actionable message instead of
# letting it surface as a Hub error further down the script.
if "<user>" in SPACE_REPO:
    raise ValueError(
        "SPACE_REPO still contains the '<user>' placeholder; set it to your "
        "Space id, e.g. 'alice/gdscript-assistant'.")
if not HF_TOKEN.strip():
    raise ValueError(
        "HF_TOKEN is empty; create a write token at "
        "https://huggingface.co/settings/tokens and set HF_TOKEN.")

# Authenticate this session with the Hub using the configured token.
login(token=HF_TOKEN)
# 1. Pull chunks.jsonl from the Space repo (fast on Colab's connection).
chunks_path = hf_hub_download(
    repo_id=SPACE_REPO, repo_type="space", filename="data/chunks.jsonl")

# ids   -> integer chunk ids, in file order (used later as the FAISS ids)
# texts -> chunk texts to embed, parallel to `ids`
# meta  -> id_map.json payload: chunk id (as a string) -> source info
ids, texts, meta = [], [], {}
with open(chunks_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL
        r = json.loads(line)
        # Derive the FAISS id and the id_map key from ONE normalized value so
        # the two can never disagree (e.g. an id stored as "007" or 7.0).
        # NOTE(review): assumes rag.py looks entries up by str(faiss_id) — confirm.
        chunk_id = int(r["id"])
        ids.append(chunk_id)
        texts.append(r["text"])
        meta[str(chunk_id)] = {"origin_url": r.get("origin_url", ""),
                               "repo": r.get("repo", "")}

# Nothing to index: stop here rather than loading the model for an empty corpus.
if not ids:
    raise ValueError(
        f"No chunks found in data/chunks.jsonl of {SPACE_REPO}; nothing to index.")
print(f"Loaded {len(ids)} chunks")
# 2. Embed on GPU. Vectors are L2-normalized, so inner product == cosine.
model = SentenceTransformer(MODEL, device="cuda", trust_remote_code=True)
model.max_seq_length = MAX_LEN  # cap tokens per chunk to bound memory

encode_options = dict(
    batch_size=BATCH,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)
# Cast to float32 for the FAISS calls that follow.
vecs = model.encode(texts, **encode_options).astype(np.float32)
print("Embedded:", vecs.shape)
# 3. Build the FAISS index: a flat inner-product index wrapped in IndexIDMap2,
#    with each vector stored under its chunk id (faiss_id == chunk id, the
#    layout rag.py expects).
dim = vecs.shape[1]
faiss_ids = np.asarray(ids, dtype=np.int64)
index = faiss.IndexIDMap2(faiss.IndexFlatIP(dim))
index.add_with_ids(vecs, faiss_ids)
faiss.write_index(index, "embeddings.faiss")

# Write the per-chunk metadata next to the index.
with open("id_map.json", "w", encoding="utf-8") as out:
    out.write(json.dumps(meta))
print("Index built:", index.ntotal, "vectors")
# 4. Push the index back to the Space repo (Colab -> HF; not your machine).
#    Both files go up in ONE commit: with a separate commit per file the repo
#    passes through a state where the new embeddings.faiss sits next to a
#    missing or stale id_map.json.
from huggingface_hub import CommitOperationAdd, create_commit

create_commit(
    repo_id=SPACE_REPO,
    repo_type="space",
    operations=[
        CommitOperationAdd(path_in_repo=f"data/{fn}", path_or_fileobj=fn)
        for fn in ("embeddings.faiss", "id_map.json")
    ],
    commit_message="Add jina FAISS index (built on GPU)",
)
# The status message previously contained a mis-encoded dash; restored to "—".
print("Done — Space will restart with full RAG.")