vivekchakraverty committed on
Commit
777ea0e
·
verified ·
1 Parent(s): 55ee315

GDScript RAG assistant: app + corpus (index added later via Colab)

Browse files
Files changed (12) hide show
  1. .gitattributes +2 -35
  2. DEPLOY.md +57 -0
  3. README.md +48 -5
  4. app.py +90 -0
  5. colab_build_index.py +70 -0
  6. data/chunks.jsonl +3 -0
  7. generate.py +78 -0
  8. prompt.py +56 -0
  9. rag.py +119 -0
  10. requirements.txt +10 -0
  11. stage_index.sh +15 -0
  12. validate.py +134 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.faiss filter=lfs diff=lfs merge=lfs -text
2
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
DEPLOY.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploying the GDScript Assistant (Colab-built jina index)
2
+
3
+ The 280 MB jina index is built on a **free Colab GPU** and pushed straight to the
4
+ Space, so it never moves over your local connection. You only push the app +
5
+ `chunks.jsonl` (~90 MB) once.
6
+
7
+ ## 0. Prerequisites
8
+ - HuggingFace account + **write token** (https://huggingface.co/settings/tokens).
9
+ - `git`, `git-lfs`, `pip install huggingface_hub`.
10
+ - `data/chunks.jsonl` is already staged in this folder.
11
+
12
+ ## Phase 1 — Push the app + corpus (your machine)
13
+ The app tolerates a missing index (it answers without retrieval until the index
14
+ is added), so deploy first:
15
+ ```bash
16
+ huggingface-cli login # write token
17
+ huggingface-cli repo create gdscript-assistant --type space --space_sdk gradio
18
+ cd hf-space/gdscript-assistant
19
+ git init && git lfs install
20
+ git add . && git commit -m "GDScript RAG assistant (app + corpus)"
21
+ git remote add origin https://huggingface.co/spaces/<user>/gdscript-assistant
22
+ git push -u origin main # ~90MB: chunks.jsonl (LFS) + code
23
+ ```
24
+ Then in **Space → Settings → Hardware → select "ZeroGPU"**.
25
+
26
+ ## Phase 2 — Build the jina index on Colab (free GPU, ~10 min)
27
+ 1. Open https://colab.research.google.com → new notebook →
28
+ **Runtime → Change runtime type → T4 GPU**.
29
+ 2. Cell 1 (install):
30
+ ```python
31
+ !pip install -q "transformers<5" sentence-transformers einops faiss-cpu huggingface_hub
32
+ ```
33
+ 3. Cell 2: paste the contents of **`colab_build_index.py`**, set at the top:
34
+ ```python
35
+ SPACE_REPO = "<user>/gdscript-assistant"
36
+ HF_TOKEN = "hf_...your_write_token..."
37
+ ```
38
+ Run it. It pulls `chunks.jsonl` from the Space, embeds 91,720 chunks with
39
+ `jina-embeddings-v2-base-code` on the GPU, builds the FAISS index, and
40
+ **uploads `data/embeddings.faiss` + `data/id_map.json` back to the Space**.
41
+ 4. The Space auto-restarts and now answers with full RAG + sources.
42
+
43
+ ## Phase 3 — Verify on the Space
44
+ - Ask *"Write a CharacterBody2D top-down movement script"* → GDScript answer, a
45
+ **✅ gdtoolkit validation** badge, and a **📚 Retrieved sources** list.
46
+ - Force a mistake to see the **🔧 auto-correct** path.
47
+ - Hitting ZeroGPU quota? HF **PRO** ($9/mo) gives much more GPU time.
48
+
49
+ ## Notes
50
+ - Index format is built to match `rag.py` exactly (cosine `IndexIDMap2`,
51
+ `faiss_id == chunk id`; `id_map.json` keyed by `str(id)`).
52
+ - `requirements.txt` pins `transformers~=4.45` so jina (query embedding) and
53
+ Qwen2.5-Coder both load with no patches.
54
+ - Validation checks **syntax + style** (gdtoolkit), not runtime/scene semantics.
55
+ - Fallback (local build): if you ever build the index locally
56
+ (`python crawl_gdscript.py embed`), run `bash stage_index.sh` then push — but
57
+ jina on this CPU is ~50h, so Colab is strongly preferred.
README.md CHANGED
@@ -1,13 +1,56 @@
1
  ---
2
- title: Gdscript Assistant
3
- emoji: 🏃
4
  colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.15.2
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GDScript Coding Assistant
3
+ emoji: 🤖
4
  colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
 
 
7
  app_file: app.py
8
  pinned: false
9
+ license: mit
10
+ short_description: RAG GDScript assistant with gdtoolkit validation
11
  ---
12
 
13
+ # 🤖 GDScript Coding Assistant
14
+
15
+ A Godot 4 / GDScript coding assistant that answers using **RAG** over a curated
16
+ **91,720-chunk** corpus crawled from the official docs, demo repos, tutorial
17
+ sites and YouTube descriptions. Generated GDScript is **syntax-validated with
18
+ `gdtoolkit`** before it's shown.
19
+
20
+ ## How it works
21
+
22
+ ```
23
+ question ─▶ jina query-embed (CPU) ─▶ FAISS top-k GDScript snippets
24
+ ─▶ Qwen2.5-Coder-7B-Instruct (ZeroGPU) ─▶ answer
25
+ ─▶ gdtoolkit parse + lint (CPU) ─▶ ✅/❌ + optional 1× self-fix
26
+ ```
27
+
28
+ - **Retriever:** `jinaai/jina-embeddings-v2-base-code` (768-dim, code-tuned),
29
+ prebuilt FAISS cosine index bundled via Git LFS (`data/embeddings.faiss`,
30
+ `data/chunks.jsonl`).
31
+ - **Generator:** `Qwen/Qwen2.5-Coder-7B-Instruct` on **ZeroGPU** (only the
32
+ generation call uses the GPU).
33
+ - **Validation:** `gdtoolkit` (`gdparse` syntax + `gdlint` style). Note: this
34
+ checks *syntax and style*, not runtime/scene semantics.
35
+
36
+ ## Setup (hardware)
37
+
38
+ In **Space → Settings → Hardware**, select **ZeroGPU**. The `spaces` package +
39
+ `@spaces.GPU` decorator in `generate.py` do the rest.
40
+
41
+ ## Local dev
42
+
43
+ ```bash
44
+ pip install -r requirements.txt
45
+ # fast UI/flow test without downloading the 7B model:
46
+ GDRAG_STUB_LLM=1 python app.py
47
+ # real retrieval needs data/embeddings.faiss + data/chunks.jsonl present
48
+ python rag.py "how do I use @export and signals"
49
+ python validate.py
50
+ ```
51
+
52
+ ## Data provenance & licensing
53
+
54
+ Snippets come from public Godot resources with **varying licenses** (docs CC-BY,
55
+ repos MIT/Apache/GPL/…). Each retrieved snippet shows its source; respect the
56
+ original licenses when reusing generated code.
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GDScript Coding Assistant — Gradio app (HF Space, ZeroGPU).
2
+
3
+ Flow per question: retrieve (CPU) -> generate (ZeroGPU) -> validate (CPU) ->
4
+ optional 1x self-correct -> render answer + validation + sources.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import gradio as gr
9
+
10
+ import rag
11
+ import prompt as promptlib
12
+ import generate as gen
13
+ import validate as gdv
14
+
15
+
16
+ def _sources_md(hits: list[rag.Hit]) -> str:
17
+ if not hits:
18
+ return ""
19
+ lines = ["\n\n<details><summary>📚 Retrieved sources</summary>\n"]
20
+ for i, h in enumerate(hits, 1):
21
+ loc = h.repo or "corpus"
22
+ url = h.origin_url or ""
23
+ link = f"[{loc}]({url})" if url.startswith("http") else loc
24
+ lines.append(f"{i}. {link} · `{h.file_path or h.kind}` · score {h.score:.2f}")
25
+ lines.append("\n</details>")
26
+ return "\n".join(lines)
27
+
28
+
29
def respond(message: str, history, top_k: int, self_correct: bool):
    """Answer one chat message: retrieve, generate, validate, optionally fix.

    Args:
        message: The user's question; blank input yields a short hint.
        history: Chat history supplied by the chat UI (not used here).
        top_k: Number of snippets to retrieve.
        self_correct: When true, a block that fails to parse triggers one
            extra generation asking the model to repair it.

    Returns:
        Markdown containing the answer, the validation report, the source
        list and, if the index files are absent, a note saying so.
    """
    question = (message or "").strip()
    if not question:
        return "Ask a GDScript or Godot question."

    hits = rag.retrieve(question, k=int(top_k))
    answer = gen.generate(promptlib.build_messages(question, hits))
    results = gdv.validate_answer(answer)

    # At most one repair attempt, and only when the user enabled it.
    failure = gdv.first_syntax_error(results) if self_correct else None
    if failure is not None:
        broken_code, parse_error = failure
        repaired = gen.generate(
            promptlib.build_fix_messages(broken_code, parse_error))
        repaired_results = gdv.validate_answer(repaired)
        # Accept the repair only if it has code blocks and all of them parse.
        if repaired_results and all(r.ok for r in repaired_results):
            answer = "".join((
                answer,
                "\n\n---\n**🔧 Auto-corrected** (original had a syntax error):\n\n",
                repaired,
            ))
            results = repaired_results

    report = gdv.render_report(results)
    if rag.index_available():
        note = ""
    else:
        note = ("\n\n> ⏳ _Retrieval index not loaded yet — answering without "
                "corpus context. Build & push the index (see DEPLOY.md)._")
    return f"{answer}\n\n---\n**Validation:** \n{report}{_sources_md(hits)}{note}"
58
+
59
+
60
with gr.Blocks(title="GDScript Coding Assistant", fill_height=True) as demo:
    # Page header.
    gr.Markdown(
        "# 🤖 GDScript Coding Assistant\n"
        "RAG over a 91,720-chunk Godot/GDScript corpus · Qwen2.5-Coder-7B · "
        "answers are **syntax-validated with gdtoolkit**."
    )

    # Collapsed settings panel; both controls are passed to respond().
    with gr.Accordion("Settings", open=False):
        top_k = gr.Slider(2, 10, value=6, step=1, label="Retrieved snippets (k)")
        self_correct = gr.Checkbox(
            value=True, label="Auto-correct one syntax error (extra GPU call)")

    # Example prompts; each row is [message, top_k, self_correct].
    _example_questions = (
        "Write a CharacterBody2D top-down movement script",
        "How do I define and emit a custom signal?",
        "Show a typed @export inventory array with @onready",
        "Make an enemy follow the player using a NavigationAgent2D",
    )
    gr.ChatInterface(
        fn=respond,
        additional_inputs=[top_k, self_correct],
        examples=[[question, 6, True] for question in _example_questions],
        cache_examples=False,
    )
82
+
83
+
84
if __name__ == "__main__":
    # Best-effort preload of the retrieval stack via rag.warmup(). A failure
    # is reported on stdout and the app still starts.
    try:
        rag.warmup()
    except Exception as exc:
        print(f"warmup (rag) skipped: {exc}")

    demo.queue(max_size=16).launch()
colab_build_index.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build the jina FAISS index on a free Colab/Kaggle GPU and push it to the Space.
2
+
3
+ Run this in a GPU Colab notebook (Runtime -> Change runtime type -> T4 GPU).
4
+ It pulls chunks.jsonl from your Space repo, embeds all chunks with
5
+ jina-embeddings-v2-base-code on the GPU (~minutes), builds the FAISS index in the
6
+ exact format rag.py expects (cosine / IndexIDMap2, faiss_id == chunk id), and
7
+ uploads embeddings.faiss + id_map.json back to the Space — so the ~280 MB index
8
+ never touches your local machine.
9
+
10
+ USAGE (paste into a Colab cell, or upload this file and `%run` it):
11
+ 1) Set SPACE_REPO and HF_TOKEN below (token: https://huggingface.co/settings/tokens, write).
12
+ 2) Run. When it finishes, the Space restarts with full RAG.
13
+
14
+ Cell 0 (install):
15
+ !pip install -q "transformers<5" sentence-transformers einops faiss-cpu huggingface_hub
16
+ """
17
+ import json
18
+ import os
19
+
20
+ import faiss
21
+ import numpy as np
22
+ from huggingface_hub import hf_hub_download, login, upload_file
23
+ from sentence_transformers import SentenceTransformer
24
+
25
+ # ─── CONFIG ────────────────────────────────────────────────────────────────
26
SPACE_REPO = os.environ.get("SPACE_REPO", "<user>/gdscript-assistant")  # <-- set
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # <-- set (write)
MODEL = "jinaai/jina-embeddings-v2-base-code"
BATCH = 256
# ───────────────────────────────────────────────────────────────────────────

# Fail fast with a clear message instead of an obscure Hub error when the
# placeholders above were left untouched.
if "<" in SPACE_REPO or "/" not in SPACE_REPO:
    raise ValueError(
        f"SPACE_REPO is still a placeholder ({SPACE_REPO!r}); "
        "set it to '<your-user>/<space-name>'.")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN is empty; set it to a HuggingFace write token.")

login(token=HF_TOKEN)

# 1. Pull chunks.jsonl from the Space repo (fast on Colab's connection).
chunks_path = hf_hub_download(
    repo_id=SPACE_REPO, repo_type="space", filename="data/chunks.jsonl")

ids, texts, meta = [], [], {}
with open(chunks_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        r = json.loads(line)
        ids.append(int(r["id"]))
        texts.append(r["text"])
        # id_map.json is keyed by str(id) and carries only provenance fields.
        meta[str(r["id"])] = {"origin_url": r.get("origin_url", ""),
                              "repo": r.get("repo", "")}
print(f"Loaded {len(ids)} chunks")
if not ids:
    raise ValueError("chunks.jsonl contained no chunks; nothing to index.")
if len(set(ids)) != len(ids):
    # Duplicate ids would map several vectors onto one chunk record.
    print(f"WARNING: {len(ids) - len(set(ids))} duplicate chunk id(s) in chunks.jsonl")

# 2. Embed on GPU (normalized -> cosine via inner product).
try:
    model = SentenceTransformer(MODEL, trust_remote_code=True, device="cuda")
except Exception as e:
    raise RuntimeError(
        "Could not load the embedding model on CUDA. In Colab select "
        "Runtime -> Change runtime type -> T4 GPU, then re-run.") from e
vecs = model.encode(texts, batch_size=BATCH, normalize_embeddings=True,
                    convert_to_numpy=True, show_progress_bar=True)
vecs = vecs.astype(np.float32)
print("Embedded:", vecs.shape)

# 3. Build FAISS index — IDMap2(FlatIP), faiss_id == chunk id (matches rag.py).
index = faiss.IndexIDMap2(faiss.IndexFlatIP(vecs.shape[1]))
index.add_with_ids(vecs, np.asarray(ids, dtype=np.int64))
faiss.write_index(index, "embeddings.faiss")
with open("id_map.json", "w", encoding="utf-8") as f:
    json.dump(meta, f)
print("Index built:", index.ntotal, "vectors")

# 4. Push the index back to the Space repo (Colab -> HF; not your machine).
for fn in ("embeddings.faiss", "id_map.json"):
    upload_file(path_or_fileobj=fn, path_in_repo=f"data/{fn}",
                repo_id=SPACE_REPO, repo_type="space",
                commit_message="Add jina FAISS index (built on GPU)")
print("Done — Space will restart with full RAG.")
data/chunks.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106af2e9e00069642dc25312710d8a48eb7501947e8c7c5437a1e001d4858914
3
+ size 88978917
generate.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
+
3
+ Only this module touches the GPU: the decorated ``generate`` runs under
4
+ ``@spaces.GPU`` so ZeroGPU allocates an A100 slice on demand; retrieval and
5
+ validation stay on CPU.
6
+
7
+ Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading the
8
+ model (so rag/validate/app can be exercised without a GPU or a 15 GB download).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from functools import lru_cache
14
+
15
# Model to load; override with the GDRAG_LLM environment variable.
MODEL_ID = os.getenv("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
# Stub mode is on only when GDRAG_STUB_LLM is exactly "1".
STUB = os.getenv("GDRAG_STUB_LLM") == "1"

# Use the real ZeroGPU decorator when `spaces` is usable; otherwise fall back
# to a pass-through so the module still works locally.
try:
    import spaces
    GPU = spaces.GPU
except Exception:  # not on a Space
    def GPU(*dargs, **dkwargs):
        """No-op stand-in accepting both ``@GPU`` and ``@GPU(duration=...)``."""
        used_bare = bool(dargs) and callable(dargs[0])
        if used_bare:
            # @GPU applied directly to a function: hand it back unchanged.
            return dargs[0]
        # @GPU(...) with arguments: return an identity decorator.
        return lambda fn: fn
30
+
31
+
32
@lru_cache(maxsize=1)
def _model_and_tokenizer():
    """Load the causal LM and its tokenizer; cached after the first call.

    Returns:
        A ``(model, tokenizer)`` pair for ``MODEL_ID``. The model is loaded in
        bfloat16 with ``device_map="auto"`` and switched to eval mode.
    """
    # Heavy imports stay local so importing this module is cheap.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
    llm.eval()
    return llm, tokenizer
42
+
43
+
44
+ def _render(messages, tok) -> str:
45
+ return tok.apply_chat_template(
46
+ messages, tokenize=False, add_generation_prompt=True)
47
+
48
+
49
@GPU(duration=120)
def generate(messages: list[dict], max_new_tokens: int = 512,
             temperature: float = 0.2) -> str:
    """Generate an assistant reply for chat-format ``messages``.

    Args:
        messages: Chat messages (``{"role": ..., "content": ...}`` dicts).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding; positive values sample with ``top_p=0.95``.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped. In stub mode a canned answer is returned and no
        model is loaded.
    """
    if STUB:
        return (
            "Here is a Godot 4 movement script:\n\n```gdscript\n"
            "extends CharacterBody2D\n\n@export var speed: float = 200.0\n\n"
            "func _physics_process(delta: float) -> void:\n"
            "\tvar dir := Input.get_vector(\"ui_left\", \"ui_right\", "
            "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
            "\tmove_and_slide()\n```\n"
        )
    import torch
    model, tok = _model_and_tokenizer()
    text = _render(messages, tok)
    inputs = tok([text], return_tensors="pt").to(model.device)

    gen_kwargs = {"max_new_tokens": max_new_tokens,
                  "pad_token_id": tok.eos_token_id}
    if temperature > 0:
        # Clamp tiny temperatures so sampling stays numerically sane.
        gen_kwargs.update(do_sample=True, temperature=max(temperature, 1e-4),
                          top_p=0.95)
    else:
        # Greedy decoding: unset the sampling knobs rather than passing fake
        # values, which transformers reports as unused generation flags.
        gen_kwargs.update(do_sample=False, temperature=None, top_p=None,
                          top_k=None)

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt tokens; keep only the newly generated continuation.
    prompt_len = inputs["input_ids"].shape[1]
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
74
+
75
+
76
def warmup() -> None:
    """Load the model and tokenizer eagerly; does nothing in stub mode."""
    if STUB:
        return
    _model_and_tokenizer()
prompt.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt assembly: system instruction + retrieved context -> chat messages."""
2
+ from __future__ import annotations
3
+
4
+ from rag import Hit
5
+
6
+ SYSTEM_PROMPT = (
7
+ "You are an expert Godot 4 GDScript assistant. Answer using the reference "
8
+ "snippets provided below when they are relevant. Always write GDScript that "
9
+ "targets Godot 4 (GDScript 2.0). Put runnable code in ```gdscript fenced "
10
+ "blocks. Prefer static typing and @export/@onready annotations where natural. "
11
+ "If the snippets don't cover the question, answer from general Godot knowledge "
12
+ "and say so briefly. Be concise."
13
+ )
14
+
15
+ # Keep the context budget modest so generation stays fast on ZeroGPU.
16
+ MAX_CONTEXT_CHARS = 6000
17
+
18
+
19
+ def _format_context(hits: list[Hit]) -> str:
20
+ blocks, used = [], 0
21
+ for i, h in enumerate(hits, 1):
22
+ src = h.repo or h.origin_url or "corpus"
23
+ snippet = h.text.strip()
24
+ block = f"# Snippet {i} (source: {src})\n{snippet}"
25
+ if used + len(block) > MAX_CONTEXT_CHARS:
26
+ break
27
+ blocks.append(block)
28
+ used += len(block)
29
+ return "\n\n".join(blocks)
30
+
31
+
32
def build_messages(question: str, hits: list[Hit],
                   history: list[dict] | None = None) -> list[dict]:
    """Assemble the chat messages sent to the generator.

    The result is the system prompt, then any prior ``history`` messages,
    then a user message. When retrieved context is available the user message
    carries the snippets followed by the question; otherwise it is the
    question alone.
    """
    context = _format_context(hits)
    if context:
        user_content = (
            f"Reference GDScript snippets from a curated Godot corpus:\n\n"
            f"{context}\n\n---\n\nQuestion: {question}"
        )
    else:
        user_content = question

    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": user_content}
    return [system_msg, *(history or []), user_msg]
45
+
46
+
47
def build_fix_messages(broken_code: str, error: str) -> list[dict]:
    """Build the repair request for a GDScript block that failed to parse.

    Returns a system message plus one user message that quotes the parser
    error and the offending code, asking for only the corrected code back.
    """
    request = "".join([
        "The following GDScript failed to parse with this error:\n",
        f"{error}\n\nFix it and return ONLY the corrected GDScript in a ",
        f"```gdscript block:\n\n```gdscript\n{broken_code}\n```",
    ])
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": request}
    return [system_msg, user_msg]
rag.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Retrieval over the GDScript corpus.
2
+
3
+ Loads the prebuilt FAISS index (cosine / IndexIDMap2, faiss_id == chunk id) and
4
+ chunks.jsonl, embeds the query with the same jina code model used to build the
5
+ index, and returns the top-k chunk records. Runs on CPU (query embedding is one
6
+ text at a time, fast).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ from dataclasses import dataclass
13
+ from functools import lru_cache
14
+ from pathlib import Path
15
+
16
+ import faiss
17
+ import numpy as np
18
+
19
# Data directory: GDRAG_SPACE_DATA if set, else the "data" folder next to
# this file.
_data_dir_override = os.environ.get("GDRAG_SPACE_DATA")
DATA_DIR = (Path(_data_dir_override) if _data_dir_override is not None
            else Path(Path(__file__).parent / "data"))
FAISS_PATH = DATA_DIR / "embeddings.faiss"
CHUNKS_PATH = DATA_DIR / "chunks.jsonl"
EMBED_MODEL = "jinaai/jina-embeddings-v2-base-code"


@dataclass
class Hit:
    """One retrieved chunk together with its similarity score."""

    score: float      # similarity score returned by the index search
    text: str         # chunk text
    repo: str         # "repo" field of the chunk record ("" if absent)
    origin_url: str   # "origin_url" field of the chunk record ("" if absent)
    file_path: str    # "file_path" field of the chunk record ("" if absent)
    kind: str         # "kind" field of the chunk record ("" if absent)
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Lazy singletons (loaded once per process)
37
+ # ---------------------------------------------------------------------------
38
@lru_cache(maxsize=1)
def _index() -> faiss.Index:
    """Read the FAISS index from ``FAISS_PATH``; cached after the first call."""
    index_file = str(FAISS_PATH)
    return faiss.read_index(index_file)
41
+
42
+
43
@lru_cache(maxsize=1)
def _chunks() -> dict[int, dict]:
    """Load ``chunks.jsonl`` into a dict keyed by integer chunk id.

    Blank lines, lines that are not valid JSON, and records without a usable
    ``id`` are skipped. Ids are coerced to ``int`` so they match the integer
    ids the index search returns, whether the file stores them as numbers or
    as strings.

    Returns:
        Mapping of chunk id to the parsed record; cached after the first call.
    """
    by_id: dict[int, dict] = {}
    with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                rec = json.loads(line)
                cid = int(rec["id"])
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Malformed line, non-object JSON, or missing/non-numeric id.
                continue
            by_id[cid] = rec
    return by_id
56
+
57
+
58
@lru_cache(maxsize=1)
def _embedder():
    """Create the SentenceTransformer for ``EMBED_MODEL``; cached after first call."""
    # Imported lazily. transformers ~=4.45 (pinned) loads jina's remote code
    # without shims.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
    return encoder
63
+
64
+
65
def _embed_query(query: str) -> np.ndarray:
    """Encode ``query`` as a normalized float32 array (a batch of one)."""
    encoder = _embedder()
    raw = encoder.encode(
        [query],
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return np.asarray(raw, dtype=np.float32)
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Public API
73
+ # ---------------------------------------------------------------------------
74
def index_available() -> bool:
    """Return True when both the FAISS index file and chunks.jsonl exist."""
    required_files = (FAISS_PATH, CHUNKS_PATH)
    return all(path.exists() for path in required_files)
76
+
77
+
78
def retrieve(query: str, k: int = 6) -> list[Hit]:
    """Return the top-k GDScript chunks most relevant to the query.

    Returns [] if the index hasn't been built/uploaded yet, so the Space still
    runs (answers without retrieval) until the Colab build pushes the index.
    Also returns [] for a blank query or ``k < 1``.

    Args:
        query: Natural-language or code query.
        k: Maximum number of hits to return.

    Returns:
        Hits in the order the index search returned them. Ids with no
        matching chunk record are dropped, so fewer than ``k`` may come back.
    """
    k = int(k)
    # FAISS rejects k <= 0, so treat it like any other "nothing to do" case.
    if k < 1 or not query.strip() or not index_available():
        return []
    qv = _embed_query(query)
    scores, ids = _index().search(qv, k)
    chunks = _chunks()
    hits: list[Hit] = []
    for score, cid in zip(scores[0], ids[0]):
        if cid < 0:  # FAISS pads with -1 when it has fewer than k results
            continue
        rec = chunks.get(int(cid))
        if not rec:
            continue
        hits.append(Hit(
            score=float(score),
            text=rec.get("text", ""),
            repo=rec.get("repo", ""),
            origin_url=rec.get("origin_url", ""),
            file_path=rec.get("file_path", ""),
            kind=rec.get("kind", ""),
        ))
    return hits
105
+
106
+
107
def warmup() -> None:
    """Load the index, chunk table and embedder if the index files exist."""
    if not index_available():
        return
    _index()
    _chunks()
    _embedder()
111
+
112
+
113
if __name__ == "__main__":
    # Ad-hoc CLI: query from argv (or a default), print the top 6 hits.
    import sys

    cli_query = " ".join(sys.argv[1:]) or "how do I use @export and signals in GDScript"
    print(f"Query: {cli_query}\n")
    for rank, hit in enumerate(retrieve(cli_query, k=6), 1):
        preview = hit.text[:160].replace("\n", " ")
        print(f"[{rank}] score={hit.score:.3f} {hit.repo} {hit.file_path}")
        print(" " + preview + "...\n")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.44
2
+ spaces>=0.30
3
+ torch
4
+ transformers~=4.45 # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
5
+ sentence-transformers~=2.7
6
+ einops # required by jina remote code
7
+ accelerate # device_map model loading
8
+ faiss-cpu>=1.8
9
+ numpy
10
+ gdtoolkit>=4.0 # GDScript syntax parse + lint
stage_index.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Copy the finished jina FAISS index into the Space's data/ dir.
# Run this from the project root (v:\Coding_RAG) AFTER the embed build completes
# (status shows embedded == 91720).
set -euo pipefail
SRC=data/index
DST=hf-space/gdscript-assistant/data

# Check every input before copying anything, so a missing file cannot leave a
# half-staged index behind.
for f in "$SRC/embeddings.faiss" "$SRC/id_map.json" data/chunks.jsonl; do
  if [ ! -f "$f" ]; then
    echo "Missing $f - build the index first (run from the project root)." >&2
    exit 1
  fi
done

mkdir -p "$DST"
cp "$SRC/embeddings.faiss" "$DST/embeddings.faiss"
cp "$SRC/id_map.json" "$DST/id_map.json"
# chunks.jsonl is already staged; refresh in case it changed:
cp data/chunks.jsonl "$DST/chunks.jsonl"

echo "Staged into $DST:"
ls -lh "$DST"
validate.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validate GDScript produced by the model using gdtoolkit (Scony's parser).
2
+
3
+ Pure-Python, CPU-only, Godot-4 (GDScript 2.0). Checks SYNTAX (gdparse) and STYLE
4
+ (gdlint); it does NOT check runtime/scene semantics (node paths, types against a
5
+ real project) — that needs the Godot engine.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from dataclasses import dataclass, field
11
+
12
# Matches ANY fenced block: group 1 is the first word of the info string (the
# language tag, possibly empty), group 2 is the body. Matching every fence,
# not only GDScript ones, keeps opening and closing fences paired, so the
# closing fence of e.g. a ```bash block is never mistaken for an opening one.
_FENCE_RE = re.compile(r"```[ \t]*([^\s`]*)[^\n`]*\n(.*?)```", re.S)

# Language tags treated as GDScript; "" covers unlabelled fences.
_GDSCRIPT_TAGS = frozenset({"", "gdscript", "gd", "godot"})


@dataclass
class BlockResult:
    """Validation outcome for one extracted code block."""

    code: str
    ok: bool  # parses (valid syntax)
    error: str = ""  # syntax error message (if any)
    lint: list[str] = field(default_factory=list)  # style/lint warnings
    formatted: str = ""  # gdformat output (if available)


def extract_gdscript_blocks(text: str) -> list[str]:
    """Pull fenced GDScript blocks from a model answer.

    Only fences tagged ``gdscript``, ``gd`` or ``godot`` (case-insensitive),
    or with no tag at all, are returned. Fences tagged with another language
    are skipped. Blocks that are empty after stripping are dropped.

    Args:
        text: The model's answer; ``None`` or ``""`` yields ``[]``.

    Returns:
        The stripped block bodies in document order.
    """
    blocks = []
    for m in _FENCE_RE.finditer(text or ""):
        if m.group(1).lower() not in _GDSCRIPT_TAGS:
            continue
        body = m.group(2).strip()
        if body:
            blocks.append(body)
    return blocks
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # gdtoolkit wrappers (imported lazily so the module loads even if absent)
32
+ # ---------------------------------------------------------------------------
33
+ def _parse(code: str) -> tuple[bool, str]:
34
+ try:
35
+ from gdtoolkit.parser import parser
36
+ except Exception as e: # gdtoolkit not installed
37
+ return True, f"(parser unavailable: {e})"
38
+ try:
39
+ parser.parse(code, gather_metadata=False)
40
+ return True, ""
41
+ except TypeError:
42
+ # older/newer signature without gather_metadata
43
+ try:
44
+ parser.parse(code)
45
+ return True, ""
46
+ except Exception as e:
47
+ return False, _fmt_err(e)
48
+ except Exception as e:
49
+ return False, _fmt_err(e)
50
+
51
+
52
+ def _fmt_err(e: Exception) -> str:
53
+ line = getattr(e, "line", None)
54
+ col = getattr(e, "column", None)
55
+ msg = str(e).strip().splitlines()[0] if str(e).strip() else type(e).__name__
56
+ if line is not None:
57
+ return f"line {line}:{col or 0}: {msg}"
58
+ return msg
59
+
60
+
61
+ def _lint(code: str) -> list[str]:
62
+ try:
63
+ from gdtoolkit.linter import lint_code
64
+ except Exception:
65
+ return []
66
+ try:
67
+ problems = lint_code(code)
68
+ except Exception:
69
+ return []
70
+ out = []
71
+ for p in problems:
72
+ line = getattr(p, "line", "?")
73
+ name = getattr(p, "name", "")
74
+ desc = getattr(p, "description", str(p))
75
+ out.append(f"line {line}: {desc}" + (f" [{name}]" if name else ""))
76
+ return out
77
+
78
+
79
+ def _format(code: str) -> str:
80
+ try:
81
+ from gdtoolkit.formatter import format_code
82
+ return format_code(code, max_line_length=100)
83
+ except Exception:
84
+ return ""
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Public API
89
+ # ---------------------------------------------------------------------------
90
def validate_code(code: str) -> BlockResult:
    """Validate one GDScript block: parse it, then lint and format if it parsed."""
    parsed_ok, parse_error = _parse(code)
    if parsed_ok:
        warnings = _lint(code)
        pretty = _format(code)
    else:
        # Lint and format are skipped for code that did not parse.
        warnings, pretty = [], ""
    return BlockResult(
        code=code,
        ok=parsed_ok,
        error=parse_error,
        lint=warnings,
        formatted=pretty,
    )
97
+
98
+
99
def validate_answer(answer: str) -> list[BlockResult]:
    """Validate every fenced GDScript block found in ``answer``, in order."""
    code_blocks = extract_gdscript_blocks(answer)
    return list(map(validate_code, code_blocks))
101
+
102
+
103
def render_report(results: list[BlockResult]) -> str:
    """Markdown summary for the UI.

    Each block gets one badge line:

    * ❌ with the error message when it failed to parse;
    * ⚠️ "not validated" when it is marked ok but still carries a message,
      which is how an unavailable parser is reported, so the UI does not
      claim validity that was never checked;
    * ✅ otherwise, with the number of lint notes if there are any.

    Up to five lint notes are listed under each block.

    Args:
        results: Per-block validation results, in answer order.

    Returns:
        The report as Markdown, or a placeholder sentence when ``results`` is
        empty.
    """
    if not results:
        return "_No GDScript code blocks detected to validate._"
    lines = []
    for i, r in enumerate(results, 1):
        if not r.ok:
            badge = f"❌ **syntax error** — {r.error}"
        elif r.error:
            badge = f"⚠️ **not validated** — {r.error}"
        else:
            badge = "✅ **valid GDScript** (syntax OK)"
            if r.lint:
                badge += f" · {len(r.lint)} lint note(s)"
        lines.append(f"**Block {i}:** {badge}")
        lines.extend(f"- ⚠ {w}" for w in r.lint[:5])
    return "\n".join(lines)
119
+
120
+
121
def first_syntax_error(results: list[BlockResult]) -> tuple[str, str] | None:
    """Find the first block that did not parse.

    Returns its ``(code, error)`` pair, or ``None`` when every block is ok.
    """
    failures = ((r.code, r.error) for r in results if not r.ok)
    return next(failures, None)
127
+
128
+
129
if __name__ == "__main__":
    # Smoke test: one snippet expected to parse and one with an unclosed "(".
    samples = {
        "GOOD": "extends Node\n\n@export var speed: float = 5.0\n\nfunc _ready() -> void:\n\tprint(speed)\n",
        "BAD": "extends Node\n\nfunc _ready(\n\tprint('oops')\n",
    }
    for label, snippet in samples.items():
        outcome = validate_code(snippet)
        print(f"== {label} == ok={outcome.ok} error={outcome.error!r} lint={outcome.lint}")