Spaces:
Running on Zero
Running on Zero
| """Retrieval over the GDScript corpus. | |
| Loads the prebuilt FAISS index (cosine / IndexIDMap2, faiss_id == chunk id) and | |
| chunks.jsonl, embeds the query with the same jina code model used to build the | |
| index, and returns the top-k chunk records. Runs on CPU (query embedding is one | |
| text at a time, fast). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| from dataclasses import dataclass | |
| from functools import lru_cache | |
| from pathlib import Path | |
| import faiss | |
| import numpy as np | |
# Directory holding the prebuilt retrieval artifacts. The GDRAG_SPACE_DATA
# environment variable overrides the default "data" folder next to this file.
DATA_DIR = Path(os.getenv("GDRAG_SPACE_DATA", Path(__file__).with_name("data")))

# Artifacts expected inside DATA_DIR.
FAISS_PATH = DATA_DIR.joinpath("embeddings.faiss")
CHUNKS_PATH = DATA_DIR.joinpath("chunks.jsonl")

# Embedding model used for queries; per the module docstring it must be the
# same model that was used to build the index.
EMBED_MODEL = "jinaai/jina-embeddings-v2-base-code"
@dataclass
class Hit:
    """One retrieved corpus chunk together with its search score.

    Instances are constructed by keyword (``Hit(score=..., text=..., ...)``),
    so the class has to be a dataclass: a plain class that only carries
    annotations has no generated ``__init__`` and rejects those arguments.
    """

    score: float  # score reported by the index search for this chunk
    text: str  # "text" field of the chunk record
    repo: str  # "repo" field of the chunk record
    origin_url: str  # "origin_url" field of the chunk record
    file_path: str  # "file_path" field of the chunk record
    kind: str  # "kind" field of the chunk record (chunk category — TODO confirm values)
| # --------------------------------------------------------------------------- | |
| # Lazy singletons (loaded once per process) | |
| # --------------------------------------------------------------------------- | |
@lru_cache(maxsize=1)
def _index() -> faiss.Index:
    """Return the FAISS index read from FAISS_PATH, loading it at most once.

    The result is memoized so that repeated queries reuse the in-memory index
    instead of re-reading the file from disk on every call.

    Raises:
        Whatever ``faiss.read_index`` raises when the file is missing or
        unreadable; a failed load is not cached, so a later call retries.
    """
    return faiss.read_index(str(FAISS_PATH))
@lru_cache(maxsize=1)
def _chunks() -> dict[int, dict]:
    """Return the chunk records from CHUNKS_PATH keyed by their "id" field.

    The file is read as JSON Lines (one JSON object per line). Loading is
    best-effort: blank lines, lines that are not valid JSON, and records that
    are not objects carrying an "id" key are skipped rather than aborting the
    whole load. When an id occurs more than once, the last record wins.

    The result is memoized, so the file is parsed once per process. The
    returned dict is shared between callers and must be treated as read-only.
    """
    by_id: dict[int, dict] = {}
    with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not a usable record (e.g. a bare list, or an
            # object without "id") is skipped just like a malformed line.
            if not isinstance(rec, dict) or "id" not in rec:
                continue
            by_id[rec["id"]] = rec
    return by_id
@lru_cache(maxsize=1)
def _embedder():
    """Return the SentenceTransformer for EMBED_MODEL, constructing it at most once.

    The model is memoized so that it is not re-instantiated on every query.
    sentence_transformers is imported lazily here, so importing this module
    does not pull it in.
    """
    # transformers ~=4.45 (pinned) loads jina's remote code without shims.
    # device="cpu" is REQUIRED on ZeroGPU: query embedding runs in retrieve(),
    # outside the @spaces.GPU block, so CUDA isn't really allocated there — left
    # on auto it lands on a phantom cuda device and returns zero vectors.
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer(EMBED_MODEL, trust_remote_code=True, device="cpu")
def _embed_query(query: str) -> np.ndarray:
    """Encode a single query string into a normalized float32 embedding array.

    The query is passed to the embedder as a one-element batch, so the result
    is presumably of shape (1, dim) — TODO confirm against the model.
    """
    model = _embedder()
    embedding = model.encode(
        [query],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    # The result is handed to the FAISS search, so it is coerced to float32.
    return np.asarray(embedding, dtype=np.float32)
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def index_available() -> bool:
    """Report whether both retrieval artifacts exist on disk.

    Returns:
        True only when FAISS_PATH and CHUNKS_PATH both exist.
    """
    if not FAISS_PATH.exists():
        return False
    return CHUNKS_PATH.exists()
def retrieve(query: str, k: int = 6) -> list[Hit]:
    """Return the top-k GDScript chunks most relevant to the query.

    Returns [] if the index hasn't been built/uploaded yet, so the Space still
    runs (answers without retrieval) until the Colab build pushes the index.

    Args:
        query: Free-text query. A blank or whitespace-only query yields [].
        k: Maximum number of hits to return. A non-positive k yields []
            instead of being passed to the index search, which rejects it.

    Returns:
        Hits in the order reported by the index search. Fewer than k hits are
        returned when the search yields fewer results or when a returned id
        has no matching chunk record.
    """
    # Cheap argument checks first: nothing below is touched for a request
    # that cannot produce any result.
    if k <= 0 or not query.strip():
        return []
    if not index_available():
        return []

    qv = _embed_query(query)
    scores, ids = _index().search(qv, k)
    chunks = _chunks()

    hits: list[Hit] = []
    # Only one query vector was searched, so row 0 holds all results.
    for score, cid in zip(scores[0], ids[0]):
        # Negative ids are skipped: FAISS pads missing results with -1.
        if cid < 0:
            continue
        rec = chunks.get(int(cid))
        if not rec:
            continue
        hits.append(Hit(
            score=float(score),
            text=rec.get("text", ""),
            repo=rec.get("repo", ""),
            origin_url=rec.get("origin_url", ""),
            file_path=rec.get("file_path", ""),
            kind=rec.get("kind", ""),
        ))
    return hits
def warmup() -> None:
    """Preload index, chunks and embedder (call at Space startup).

    Does nothing when the index artifacts are not available yet.
    """
    if not index_available():
        return
    # Same order as a query would touch them: index, chunk records, model.
    _index()
    _chunks()
    _embedder()
if __name__ == "__main__":
    # Manual smoke test: run a query from the command line (or a built-in
    # sample query when no arguments are given) and print the hits.
    import sys

    cli_query = " ".join(sys.argv[1:])
    if not cli_query:
        cli_query = "how do I use @export and signals in GDScript"
    print(f"Query: {cli_query}\n")

    for rank, hit in enumerate(retrieve(cli_query, k=6), start=1):
        # One-line preview: first 160 characters with newlines flattened.
        preview = hit.text[:160].replace("\n", " ")
        print(f"[{rank}] score={hit.score:.3f} {hit.repo} {hit.file_path}")
        print(" " + preview + "...\n")