# Spaces: Running on Zero
"""Build the jina FAISS index on a free Colab/Kaggle GPU and push it to the Space.

Run this in a GPU Colab notebook (Runtime -> Change runtime type -> T4 GPU).
It pulls chunks.jsonl from your Space repo, embeds all chunks with
jina-embeddings-v2-base-code on the GPU (~minutes), builds the FAISS index in the
exact format rag.py expects (cosine / IndexIDMap2, faiss_id == chunk id), and
uploads embeddings.faiss + id_map.json back to the Space — so the ~280 MB index
never touches your local machine.

USAGE (paste into a Colab cell, or upload this file and `%run` it):
  1) Set SPACE_REPO and HF_TOKEN below, or provide them as environment variables
     of the same names (create a token with write access at
     https://huggingface.co/settings/tokens).
  2) Run. When it finishes, the Space restarts with full RAG.

Cell 0 (install):
  !pip install -q "transformers<5" sentence-transformers einops faiss-cpu huggingface_hub
"""
import os

# Configure the CUDA allocator before any torch-backed import below runs.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import json

import faiss
import numpy as np
from huggingface_hub import (
    CommitOperationAdd,
    create_commit,
    hf_hub_download,
    login,
    upload_file,
)
from sentence_transformers import SentenceTransformer
# ─── CONFIG ────────────────────────────────────────────────────────────────
# SPACE_REPO and HF_TOKEN are read from the environment first, so they can be
# set in an earlier notebook cell instead of editing the defaults here.
SPACE_REPO = os.environ.get("SPACE_REPO", "<user>/gdscript-assistant")  # <-- set
# Stripped so a token pasted with a trailing newline/space still works.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()  # <-- set (write)
MODEL = "jinaai/jina-embeddings-v2-base-code"
BATCH = 32  # small batch + capped length avoids T4 OOM on long chunks
MAX_LEN = 1024  # chunks are mostly tiny (p90 ~242 tokens); cap bounds memory
# ───────────────────────────────────────────────────────────────────────────

# Fail fast on unset configuration: otherwise the placeholder repo id or an
# empty token is handed to the Hub client and fails with a less obvious error.
if "<user>" in SPACE_REPO:
    raise ValueError(
        "SPACE_REPO still contains the '<user>' placeholder; set it to "
        "'<your-username>/<space-name>' before running."
    )
if not HF_TOKEN:
    raise ValueError(
        "HF_TOKEN is empty; set it to a Hugging Face token with write access "
        "(https://huggingface.co/settings/tokens)."
    )

login(token=HF_TOKEN)
# 1. Pull chunks.jsonl from the Space repo (fast on Colab's connection).
chunks_path = hf_hub_download(
    repo_id=SPACE_REPO, repo_type="space", filename="data/chunks.jsonl")

# ids[i] and texts[i] describe the same chunk; meta maps str(chunk id) to the
# provenance fields that are later written to id_map.json.
ids, texts, meta = [], [], {}
with open(chunks_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        r = json.loads(line)
        chunk_id = int(r["id"])
        ids.append(chunk_id)
        texts.append(r["text"])
        # Key on the normalised integer id so the id_map key is always exactly
        # str(<FAISS id>), even if the raw "id" field is e.g. "007" or " 7".
        meta[str(chunk_id)] = {"origin_url": r.get("origin_url", ""),
                               "repo": r.get("repo", "")}

if not ids:
    # Stop before loading the model: there would be nothing to embed.
    raise ValueError(f"No chunks found in {chunks_path}; nothing to embed")
if len(meta) != len(ids):
    # Duplicate ids collapse to one meta entry but would each add a vector.
    print(f"WARNING: {len(ids) - len(meta)} duplicate chunk id(s) in {chunks_path}")
print(f"Loaded {len(ids)} chunks")
# 2. Embed on GPU (normalized -> cosine via inner product).
# trust_remote_code=True lets the model repo's own modelling code be loaded.
model = SentenceTransformer(MODEL, device="cuda", trust_remote_code=True)
model.max_seq_length = MAX_LEN  # cap the sequence length fed to the model
vecs = model.encode(
    texts,
    batch_size=BATCH,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype(np.float32)
print("Embedded:", vecs.shape)
# 3. Build FAISS index — IDMap2(FlatIP), faiss_id == chunk id (matches rag.py).
dim = vecs.shape[1]
faiss_ids = np.asarray(ids, dtype=np.int64)
index = faiss.IndexIDMap2(faiss.IndexFlatIP(dim))
index.add_with_ids(vecs, faiss_ids)
faiss.write_index(index, "embeddings.faiss")

# Persist the per-chunk metadata next to the index.
with open("id_map.json", "w", encoding="utf-8") as out:
    out.write(json.dumps(meta))
print("Index built:", index.ntotal, "vectors")
# 4. Push the index back to the Space repo (Colab -> HF; not your machine).
# Both files go up in ONE commit so the Space never sees a new index paired
# with a stale id_map (or vice versa), and only one commit is created.
index_files = ("embeddings.faiss", "id_map.json")
create_commit(
    repo_id=SPACE_REPO,
    repo_type="space",
    operations=[
        CommitOperationAdd(path_in_repo=f"data/{fn}", path_or_fileobj=fn)
        for fn in index_files
    ],
    commit_message="Add jina FAISS index (built on GPU)",
)
print("Done — Space will restart with full RAG.")