Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

App Files Files Community

gdscript-assistant / colab_build_index.py

vivekchakraverty

Fix Colab OOM: cap seq length + smaller batch

c314e63 verified 2 days ago

raw

history blame contribute delete

3.62 kB

	"""Build the jina FAISS index on a free Colab/Kaggle GPU and push it to the Space.

	Run this in a GPU Colab notebook (Runtime -> Change runtime type -> T4 GPU).
	It pulls chunks.jsonl from your Space repo, embeds all chunks with
	jina-embeddings-v2-base-code on the GPU (~minutes), builds the FAISS index in the
	exact format rag.py expects (cosine / IndexIDMap2, faiss_id == chunk id), and
	uploads embeddings.faiss + id_map.json back to the Space — so the ~280 MB index
	never touches your local machine.

	USAGE (paste into a Colab cell, or upload this file and `%run` it):
	1) Set SPACE_REPO and HF_TOKEN below (token: https://huggingface.co/settings/tokens, write).
	2) Run. When it finishes, the Space restarts with full RAG.

	Cell 0 (install):
	!pip install -q "transformers<5" sentence-transformers einops faiss-cpu huggingface_hub
	"""
	import os
	os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

	import json

	import faiss
	import numpy as np
	from huggingface_hub import (
	    CommitOperationAdd,
	    create_commit,
	    hf_hub_download,
	    login,
	    upload_file,
	)
	from sentence_transformers import SentenceTransformer

	# ─── CONFIG ────────────────────────────────────────────────────────────────
	# Target Space repo as "<owner>/<space-name>"; the SPACE_REPO env var overrides it.
	SPACE_REPO = os.environ.get("SPACE_REPO", "<user>/gdscript-assistant") # <-- set
	# Hugging Face access token with write scope; the HF_TOKEN env var overrides it.
	HF_TOKEN = os.environ.get("HF_TOKEN", "") # <-- set (write)
	MODEL = "jinaai/jina-embeddings-v2-base-code"
	BATCH = 32 # small batch + capped length avoids T4 OOM on long chunks
	MAX_LEN = 1024 # chunks are mostly tiny (p90 ~242 tokens); cap bounds memory
	# ───────────────────────────────────────────────────────────────────────────

	# Fail fast with an actionable message: the untouched placeholder is not a
	# valid repo id and would otherwise only surface as an error from the
	# download call further down.
	if "<user>" in SPACE_REPO:
	    raise ValueError(
	        "SPACE_REPO is still the placeholder; set it to '<owner>/<space-name>' "
	        "(edit the CONFIG section or export the SPACE_REPO env var).")

	# An empty token is passed as None so login() falls back to its interactive
	# prompt instead of trying to authenticate with an empty string.
	login(token=HF_TOKEN or None)

	# 1. Pull chunks.jsonl from the Space repo (fast on Colab's connection).
	chunks_path = hf_hub_download(
	    repo_id=SPACE_REPO, repo_type="space", filename="data/chunks.jsonl")

	# ids and texts are parallel lists (one entry per chunk, in file order);
	# meta maps str(chunk id) -> {"origin_url", "repo"} and becomes id_map.json.
	ids, texts, meta = [], [], {}
	with open(chunks_path, encoding="utf-8") as f:
	    for line in f:
	        if not line.strip():
	            continue  # tolerate blank lines in the JSONL file
	        r = json.loads(line)
	        # Normalise the id once so the FAISS id and the id_map key always
	        # agree (e.g. "007" or 12.0 would otherwise yield mismatched keys).
	        chunk_id = int(r["id"])
	        ids.append(chunk_id)
	        texts.append(r["text"])
	        meta[str(chunk_id)] = {"origin_url": r.get("origin_url", ""),
	                               "repo": r.get("repo", "")}

	# Stop before loading the model: with no chunks there is nothing to embed
	# and the index construction below could not determine a vector dimension.
	if not ids:
	    raise ValueError(f"No chunks found in {chunks_path}; nothing to index.")
	print(f"Loaded {len(ids)} chunks")

	# 2. Embed on GPU (normalized -> cosine via inner product).
	model = SentenceTransformer(MODEL, device="cuda", trust_remote_code=True)
	# Cap the sequence length before encoding so long chunks are truncated
	# to MAX_LEN tokens rather than exhausting GPU memory.
	model.max_seq_length = MAX_LEN
	encode_options = {
	    "batch_size": BATCH,
	    "normalize_embeddings": True,
	    "convert_to_numpy": True,
	    "show_progress_bar": True,
	}
	# The vectors are stored as float32 for the FAISS index built next.
	vecs = model.encode(texts, **encode_options).astype(np.float32)
	print("Embedded:", vecs.shape)

	# 3. Build FAISS index — IDMap2(FlatIP), faiss_id == chunk id (matches rag.py).
	embedding_dim = vecs.shape[1]
	flat_ip = faiss.IndexFlatIP(embedding_dim)
	index = faiss.IndexIDMap2(flat_ip)
	# Each vector is stored under its chunk id (as int64) instead of a
	# sequential position.
	faiss_ids = np.asarray(ids, dtype=np.int64)
	index.add_with_ids(vecs, faiss_ids)
	faiss.write_index(index, "embeddings.faiss")
	# Write the id -> metadata map alongside the index file.
	with open("id_map.json", "w", encoding="utf-8") as id_map_file:
	    id_map_file.write(json.dumps(meta))
	print("Index built:", index.ntotal, "vectors")

	# 4. Push the index back to the Space repo (Colab -> HF; not your machine).
	# Both artifacts go into ONE commit: separate per-file commits would leave
	# the repo briefly holding a new embeddings.faiss next to a stale or missing
	# id_map.json.
	operations = [
	    CommitOperationAdd(path_in_repo=f"data/{fn}", path_or_fileobj=fn)
	    for fn in ("embeddings.faiss", "id_map.json")
	]
	create_commit(repo_id=SPACE_REPO, repo_type="space", operations=operations,
	              commit_message="Add jina FAISS index (built on GPU)")
	print("Done — Space will restart with full RAG.")