apytel committed on
Commit Β·
11ba2bd
1
Parent(s): f9d0b31
Redesigns UI for FreeCAD RAG Python script generator
Browse filesReplaces the generic chatbot interface with a custom Gradio
- CLAUDE.md +53 -0
- README.md +96 -8
- app.py +118 -60
- build_index.py +87 -0
- freecad-docs +1 -0
- requirements.txt +13 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/chunk.cpython-313.pyc +0 -0
- src/__pycache__/citations.cpython-313.pyc +0 -0
- src/__pycache__/config.cpython-313.pyc +0 -0
- src/__pycache__/generate.cpython-313.pyc +0 -0
- src/__pycache__/ingest.cpython-313.pyc +0 -0
- src/__pycache__/retrieve.cpython-313.pyc +0 -0
- src/chunk.py +118 -0
- src/citations.py +37 -0
- src/config.py +26 -0
- src/generate.py +185 -0
- src/ingest.py +62 -0
- src/retrieve.py +199 -0
CLAUDE.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CLAUDE.md
|
| 2 |
+
|
| 3 |
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
| 4 |
+
|
| 5 |
+
## Commands
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
# Install dependencies
|
| 9 |
+
pip install -r requirements.txt
|
| 10 |
+
|
| 11 |
+
# Build retrieval indices (one-time, requires freecad-docs/ to be cloned)
|
| 12 |
+
git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs
|
| 13 |
+
python build_index.py --repo freecad-docs
|
| 14 |
+
|
| 15 |
+
# Run the Gradio app
|
| 16 |
+
python app.py
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
The app requires `data/chunks.parquet`, `data/index.faiss`, and `data/bm25.pkl` to exist before it will serve requests. `indices_ready()` in `src/retrieve.py` checks for these.
|
| 20 |
+
|
| 21 |
+
## Architecture
|
| 22 |
+
|
| 23 |
+
The system is a two-phase pipeline: **offline indexing** (`build_index.py`) and **online serving** (`app.py`).
|
| 24 |
+
|
| 25 |
+
### Offline: `build_index.py`
|
| 26 |
+
Reads FreeCAD wiki markdown from `freecad-docs/wiki/`, passes pages through `src/ingest.py` → `src/chunk.py`, then builds two indices written to `data/`:
|
| 27 |
+
- **BM25** (`bm25s`, `bm25.pkl`) — tokenised with a custom camelCase/snake_case tokeniser in `src/retrieve.py:_tokenize`
|
| 28 |
+
- **Dense** (`FAISS IndexFlatIP`, `index.faiss`) — embeddings from `BAAI/bge-small-en-v1.5`
|
| 29 |
+
|
| 30 |
+
### Online: `app.py` → `src/retrieve.py` → `src/generate.py`
|
| 31 |
+
1. `HybridRetriever.retrieve(query)` runs BM25 + dense search, fuses with Reciprocal Rank Fusion (k=60), optionally reranks with `BAAI/bge-reranker-base` cross-encoder, returns top-N `Citation` objects.
|
| 32 |
+
2. `generate_response()` formats citations into a numbered context block, prepends the system prompt (with two few-shot examples), and calls the HuggingFace Inference API (`huggingface_hub.InferenceClient`) with the selected model.
|
| 33 |
+
3. The response is split into a `python` code block and a prose explanation with inline `[N]` citation references.
|
| 34 |
+
|
| 35 |
+
### Key files
|
| 36 |
+
- `src/config.py` — all tuneable constants (chunk size, top-K values, model names, file paths). Change retrieval hyperparameters here.
|
| 37 |
+
- `src/chunk.py` — header-split + code-block-preserving chunker. Fenced code blocks are replaced with UUID placeholders before splitting so they are never broken mid-block.
|
| 38 |
+
- `src/retrieve.py` — all retrieval logic including lazy model singletons (`_load_*` functions) that are cached at module level for the Gradio process lifetime.
|
| 39 |
+
- `src/generate.py` — system prompt, two few-shot examples (parametric box, revolve), and the HuggingFace Inference API call. The few-shot examples are the authoritative reference for expected script style.
|
| 40 |
+
- `src/citations.py` — `Citation` dataclass, context block formatter, and citation markdown renderer.
|
| 41 |
+
- `src/ingest.py` — walks `freecad-docs/wiki/*.md`, skips Category/Template/MediaWiki pages, and flags ~25 high-priority scripting pages for front-sorting.
|
| 42 |
+
|
| 43 |
+
## FreeCAD script generation constraints
|
| 44 |
+
|
| 45 |
+
All generated scripts must:
|
| 46 |
+
- Target **FreeCAD 1.1** (released March 25, 2026)
|
| 47 |
+
- Never import `*Gui` modules — they crash headless (`freecadcmd`)
|
| 48 |
+
- Use `body.newObject(...)` not `doc.addObject(...)` for PartDesign features
|
| 49 |
+
- Call `doc.recompute()` after every feature
|
| 50 |
+
- Add dress-up features (Fillet, Chamfer) only after all additive/subtractive features
|
| 51 |
+
- Reference geometry by index to minimise Topological Naming Problem risk
|
| 52 |
+
|
| 53 |
+
These rules are encoded in `_SYSTEM_PROMPT` in `src/generate.py` and must stay consistent with any few-shot examples added there.
|
README.md
CHANGED
|
@@ -1,16 +1,104 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.5.1
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
hf_oauth: true
|
| 11 |
-
hf_oauth_scopes:
|
| 12 |
-
- inference-api
|
| 13 |
license: apache-2.0
|
|
|
|
| 14 |
---
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: FreeCAD RAG Assistant
|
| 3 |
+
emoji: 🛠️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.5.1
|
| 8 |
+
python_version: "3.11"
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
|
|
|
|
|
|
| 11 |
license: apache-2.0
|
| 12 |
+
short_description: Generate parametric FreeCAD Python from natural language via RAG
|
| 13 |
---
|
| 14 |
|
| 15 |
+
# FreeCAD RAG Assistant
|
| 16 |
+
|
| 17 |
+
A RAG (Retrieval-Augmented Generation) system that generates complete, runnable **FreeCAD 1.1 Python scripts** from natural-language descriptions of parts.
|
| 18 |
+
|
| 19 |
+
## Architecture
|
| 20 |
+
|
| 21 |
+
```
|
| 22 |
+
Query
  │
  ├─► BM25 retrieval (bm25s)          ─┐
  │                                    ├─► RRF fusion ─► Cross-encoder rerank ─► Top-5 chunks
  └─► Dense retrieval (bge-small-en)  ─┘
                                                          │
                    HuggingFace Inference API (default: Qwen2.5-Coder-32B-Instruct) + system prompt
                                                          │
                    Generated Python + inline citations
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**Corpus**: [FreeCAD/FreeCAD-documentation](https://github.com/FreeCAD/FreeCAD-documentation) (CC0 1.0) — ~1,500 English wiki pages covering PartDesign, Sketcher, Python scripting API, and release notes.
|
| 34 |
+
|
| 35 |
+
## Setup
|
| 36 |
+
|
| 37 |
+
### 1. Install dependencies
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
pip install -r requirements.txt
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
### 2. Build the retrieval index (one-time, run locally)
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
# Clone the FreeCAD documentation repo
|
| 47 |
+
git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs
|
| 48 |
+
|
| 49 |
+
# Build BM25 + FAISS indices (outputs to data/)
|
| 50 |
+
python build_index.py --repo freecad-docs
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
This produces `data/chunks.parquet`, `data/index.faiss`, and `data/bm25.pkl`. Commit these to the repo before pushing to Hugging Face Spaces.
|
| 54 |
+
|
| 55 |
+
### 3. Run
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
python app.py
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
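Generation runs through the HuggingFace Inference API; pick the model from the **HuggingFace model** dropdown. The UI has no API-key field, so any credentials the Inference API requires have to be supplied outside the UI (for example via the environment).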
|
| 62 |
+
|
| 63 |
+
## Retrieval modes
|
| 64 |
+
|
| 65 |
+
| Toggle | Method | Wins on |
|
| 66 |
+
|--------|--------|---------|
|
| 67 |
+
| BM25 | `bm25s` with camelCase/snake_case tokenisation | Exact API tokens: `addConstraint`, `Coincident`, `PartDesign::Pad` |
|
| 68 |
+
| Dense | `BAAI/bge-small-en-v1.5` + FAISS IndexFlatIP | Paraphrased intent: "round the edges" → Fillet |
|
| 69 |
+
| Rerank | `BAAI/bge-reranker-base` cross-encoder | Precision: re-scores top-30 fused candidates |
|
| 70 |
+
| Hybrid (default) | Reciprocal Rank Fusion (k=60) | Best overall recall |
|
| 71 |
+
|
| 72 |
+
## Project structure
|
| 73 |
+
|
| 74 |
+
```
|
| 75 |
+
├── app.py                  # Gradio Blocks UI
├── build_index.py          # One-off corpus ingestion + indexing
├── requirements.txt
├── src/
│   ├── config.py           # All tuneable constants
│   ├── ingest.py           # Markdown page loader
│   ├── chunk.py            # Header-split + code-block-preserving chunker
│   ├── retrieve.py         # BM25Retriever, DenseRetriever, RRF, HybridRetriever
│   ├── generate.py         # System prompt, few-shots, HuggingFace Inference API call
│   └── citations.py        # Citation dataclass + rendering
└── data/                   # Pre-built indices (commit via git-LFS if > 100 MB)
    ├── chunks.parquet
    ├── index.faiss
    └── bm25.pkl
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## FreeCAD-specific notes
|
| 92 |
+
|
| 93 |
+
- All generated scripts target **FreeCAD 1.1** (released March 25, 2026).
|
| 94 |
+
- Scripts are safe to run with `freecadcmd` (headless) — `*Gui` modules are never imported.
|
| 95 |
+
- The system prompt explicitly warns about the **Topological Naming Problem**: geometry is referenced by index where possible, and dress-up features (Fillet, Chamfer) are always added after all additive/subtractive features.
|
| 96 |
+
- `doc.recompute()` is called after every feature to avoid silent failures.
|
| 97 |
+
|
| 98 |
+
## Evaluation queries
|
| 99 |
+
|
| 100 |
+
See section 12 of the technical report for the 12-query test set covering: parametric box, flange with bolt pattern, hex nut, L-bracket, threaded shaft, spreadsheet-driven gear, revolution, coincident constraint question, TNP question, linear pattern, helix sweep, and multi-loop sketch.
|
| 101 |
+
|
| 102 |
+
## License
|
| 103 |
+
|
| 104 |
+
Source code: Apache 2.0. Documentation corpus: [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) (FreeCAD Wiki). Attribution to FreeCAD Wiki (CC-BY 3.0) shown in the UI.
|
app.py
CHANGED
|
@@ -1,68 +1,126 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from huggingface_hub import InferenceClient
|
| 3 |
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
""
|
| 15 |
-
|
| 16 |
-
""
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
response = ""
|
| 26 |
-
|
| 27 |
-
for message in client.chat_completion(
|
| 28 |
-
messages,
|
| 29 |
-
max_tokens=max_tokens,
|
| 30 |
-
stream=True,
|
| 31 |
-
temperature=temperature,
|
| 32 |
-
top_p=top_p,
|
| 33 |
-
):
|
| 34 |
-
choices = message.choices
|
| 35 |
-
token = ""
|
| 36 |
-
if len(choices) and choices[0].delta.content:
|
| 37 |
-
token = choices[0].delta.content
|
| 38 |
-
|
| 39 |
-
response += token
|
| 40 |
-
yield response
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
"""
|
| 44 |
-
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
| 45 |
-
"""
|
| 46 |
-
chatbot = gr.ChatInterface(
|
| 47 |
-
respond,
|
| 48 |
-
additional_inputs=[
|
| 49 |
-
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
|
| 50 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
|
| 51 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
| 52 |
-
gr.Slider(
|
| 53 |
-
minimum=0.1,
|
| 54 |
-
maximum=1.0,
|
| 55 |
-
value=0.95,
|
| 56 |
-
step=0.05,
|
| 57 |
-
label="Top-p (nucleus sampling)",
|
| 58 |
-
),
|
| 59 |
-
],
|
| 60 |
)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""FreeCAD RAG Assistant β Gradio Blocks UI."""
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
|
| 4 |
+
from src.generate import generate_response
|
| 5 |
+
from src.retrieve import HybridRetriever, indices_ready
|
| 6 |
+
from src.config import HF_MODELS, DEFAULT_MODEL
|
| 7 |
|
| 8 |
+
# ββ example queries βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 9 |
+
|
| 10 |
+
_EXAMPLES = [
|
| 11 |
+
["Create a parametric box width=50 height=30 depth=20 with 5mm fillets on all vertical edges"],
|
| 12 |
+
["Make a flange OD=80mm ID=40mm thickness=10mm with 4 M6 bolt holes on a 60mm PCD using PolarPattern"],
|
| 13 |
+
["Create a hex nut for M10 thread conforming to ISO 4032 dimensions"],
|
| 14 |
+
["Generate an L-bracket 60x40x4mm with two 6mm countersunk holes"],
|
| 15 |
+
["Create a 20mm-diameter shaft with M20 thread for 30mm at one end"],
|
| 16 |
+
["Make a parametric gear blank where the number of teeth is driven by a Spreadsheet cell"],
|
| 17 |
+
["Create a wine-glass shape by revolving a profile around the Z axis"],
|
| 18 |
+
["How do I add a coincident constraint between two endpoints in a Sketcher script?"],
|
| 19 |
+
["What is the topological naming problem and how should I avoid it in generated scripts?"],
|
| 20 |
+
["Linear pattern of 5 pockets along X with 15mm spacing"],
|
| 21 |
+
["Sweep a circle along a helical path to make a spring"],
|
| 22 |
+
["Create a Pad with a sketch containing an interior circular hole (multi-loop sketch)"],
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
_INDEX_WARNING = (
|
| 26 |
+
"> **Index not found.** Run `python build_index.py --repo <path-to-freecad-docs>` "
|
| 27 |
+
"to build the retrieval index before using this app."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
+
# ββ generation handler ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
+
|
| 32 |
+
def run(
    prompt: str,
    use_bm25: bool,
    use_dense: bool,
    use_rerank: bool,
    top_n: int,
    model: str,
):
    """Handle one "Generate" click: retrieve context, call the LLM, shape the outputs.

    Args:
        prompt: Free-text part description or scripting question.
        use_bm25: Forwarded to ``HybridRetriever`` as ``use_bm25``.
        use_dense: Forwarded to ``HybridRetriever`` as ``use_dense``.
        use_rerank: Forwarded to ``HybridRetriever`` as ``use_rerank``.
        top_n: Number of chunks to keep; coerced to ``int`` before use.
        model: Model identifier passed through to ``generate_response``.

    Returns:
        A ``(code, explanation_markdown, chunk_rows)`` triple matching the three
        output components. On any failure ``code`` is ``""``, the message is put
        in the explanation slot, and ``chunk_rows`` is ``[]``.
    """
    # Treat a missing prompt like an empty one instead of raising on ``None.strip()``.
    if not (prompt or "").strip():
        return "", "Please enter a request.", []

    if not indices_ready():
        return "", _INDEX_WARNING, []

    retriever = HybridRetriever(
        use_bm25=use_bm25,
        use_dense=use_dense,
        use_rerank=use_rerank,
        # The signature promises an int; guard against a float arriving from
        # the slider widget so downstream slicing/limits stay valid.
        top_n=int(top_n),
    )

    try:
        citations = retriever.retrieve(prompt)
    except Exception as exc:  # noqa: BLE001
        # UI boundary: report the failure in the explanation pane.
        return "", f"Retrieval error: {exc}", []

    code, explain, err = generate_response(
        query=prompt,
        citations=citations,
        model=model,
    )
    if err:
        return "", f"**Error:** {err}", []

    # One row per citation, in the column order of the "Retrieved chunks" table.
    chunk_rows = [
        [c.id, c.page_title, c.section, c.source_url, f"{c.score:.4f}"]
        for c in citations
    ]
    return code, explain, chunk_rows
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ββ UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 75 |
+
|
| 76 |
+
with gr.Blocks(title="FreeCAD RAG Assistant", analytics_enabled=False) as demo:
|
| 77 |
+
gr.Markdown(
|
| 78 |
+
"# FreeCAD Python Code Generator\n"
|
| 79 |
+
"Describe a parametric part and get a complete, runnable FreeCAD 1.1 Python script "
|
| 80 |
+
"retrieved from the official FreeCAD wiki documentation.\n\n"
|
| 81 |
+
"> Source: [FreeCAD Wiki](https://wiki.freecad.org), CC-BY 3.0"
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
with gr.Row():
|
| 85 |
+
# ββ left column: inputs βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 86 |
+
with gr.Column(scale=2):
|
| 87 |
+
prompt = gr.Textbox(
|
| 88 |
+
label="Describe the part or ask a scripting question",
|
| 89 |
+
lines=4,
|
| 90 |
+
placeholder="Create a parametric flange with 4 M6 bolt holes on a 60mm PCDβ¦",
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
with gr.Accordion("Retrieval settings", open=False):
|
| 94 |
+
use_bm25 = gr.Checkbox(value=True, label="Enable BM25 (keyword retrieval)")
|
| 95 |
+
use_dense = gr.Checkbox(value=True, label="Enable dense retrieval (semantic)")
|
| 96 |
+
use_rerank = gr.Checkbox(value=True, label="Enable cross-encoder reranking")
|
| 97 |
+
top_n = gr.Slider(minimum=3, maximum=10, value=5, step=1,
|
| 98 |
+
label="Final chunks passed to LLM (top-N)")
|
| 99 |
+
model = gr.Dropdown(
|
| 100 |
+
choices=HF_MODELS, value=DEFAULT_MODEL, label="HuggingFace model"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
run_btn = gr.Button("Generate", variant="primary")
|
| 104 |
+
|
| 105 |
+
gr.Examples(examples=_EXAMPLES, inputs=[prompt], label="Example queries", cache_examples=False)
|
| 106 |
+
|
| 107 |
+
# ββ right column: outputs βββββββββββββββββββββββββββββββββββββββββββββ
|
| 108 |
+
with gr.Column(scale=3):
|
| 109 |
+
code_out = gr.Code(label="Generated FreeCAD Python", language="python")
|
| 110 |
+
explain_out = gr.Markdown(label="Explanation & citations")
|
| 111 |
+
|
| 112 |
+
with gr.Accordion("Retrieved chunks", open=False):
|
| 113 |
+
chunks_out = gr.Dataframe(
|
| 114 |
+
headers=["#", "Page", "Section", "URL", "Score"],
|
| 115 |
+
wrap=True,
|
| 116 |
+
label="Top retrieved chunks",
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
run_btn.click(
|
| 120 |
+
fn=run,
|
| 121 |
+
inputs=[prompt, use_bm25, use_dense, use_rerank, top_n, model],
|
| 122 |
+
outputs=[code_out, explain_out, chunks_out],
|
| 123 |
+
)
|
| 124 |
|
| 125 |
|
| 126 |
if __name__ == "__main__":
|
build_index.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
One-off script: read a cloned FreeCAD docs repo, chunk it, embed the chunks, and build the retrieval indices.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs
|
| 6 |
+
python build_index.py --repo freecad-docs
|
| 7 |
+
|
| 8 |
+
Outputs written to data/:
|
| 9 |
+
  chunks.parquet — all chunk metadata + text
|
| 10 |
+
  index.faiss — FAISS IndexFlatIP of bge-small-en-v1.5 embeddings
|
| 11 |
+
  bm25.pkl — serialised bm25s index
|
| 12 |
+
"""
|
| 13 |
+
import argparse
|
| 14 |
+
import os
|
| 15 |
+
import pickle
|
| 16 |
+
|
| 17 |
+
import bm25s
|
| 18 |
+
import faiss
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from sentence_transformers import SentenceTransformer
|
| 22 |
+
from tqdm import tqdm
|
| 23 |
+
|
| 24 |
+
from src.chunk import chunk_pages
|
| 25 |
+
from src.config import BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE
|
| 26 |
+
from src.ingest import load_freecad_docs
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _embed_batched(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Embed *texts* in batches and return one stacked float32 matrix.

    Args:
        model: Embedding model; ``encode`` is called once per batch with
            ``normalize_embeddings=True``.
        texts: Strings to embed, in output-row order.
        batch_size: Number of texts passed to ``encode`` per call.

    Returns:
        A float32 array with one row per input text. For an empty *texts* list
        the result has shape ``(0, dim)``, where ``dim`` is the model's reported
        embedding dimension.
    """
    if not texts:
        # np.vstack([]) raises ValueError, so build the empty matrix explicitly.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype="float32")

    all_vecs = [
        model.encode(texts[i : i + batch_size], normalize_embeddings=True, show_progress_bar=False)
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding")
    ]
    return np.vstack(all_vecs).astype("float32")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def build(repo_root: str, data_dir: str = "data") -> None:
    """Build the chunk table, BM25 index and FAISS index for the docs repo.

    Args:
        repo_root: Path handed to ``load_freecad_docs``.
        data_dir: Directory that receives the three output files. The file
            names are taken from ``CHUNKS_FILE``, ``BM25_FILE`` and
            ``FAISS_FILE``; only their directory part is replaced, so the
            default ``"data"`` yields the same locations as the config.

    Raises:
        ValueError: If no chunks were produced, so there is nothing to index.
    """
    os.makedirs(data_dir, exist_ok=True)

    # Honour data_dir: previously the directory was created but every file was
    # still written to the hard-coded config paths.
    chunks_path = os.path.join(data_dir, os.path.basename(CHUNKS_FILE))
    bm25_path = os.path.join(data_dir, os.path.basename(BM25_FILE))
    faiss_path = os.path.join(data_dir, os.path.basename(FAISS_FILE))

    print("Loading FreeCAD docs...")
    pages = load_freecad_docs(repo_root)
    print(f" {len(pages)} pages loaded")

    print("Chunking...")
    chunks = chunk_pages(pages)
    print(f" {len(chunks)} chunks produced")

    if not chunks:
        # Without this guard an empty frame fails on set_index("chunk_id")
        # with a KeyError that hides the real cause.
        raise ValueError(f"No chunks produced from {repo_root!r}; check the --repo path.")

    df = pd.DataFrame(chunks).set_index("chunk_id")
    df.to_parquet(chunks_path)
    print(f" Saved {chunks_path}")

    texts = df["text"].tolist()

    # -- BM25 index ---------------------------------------------------------
    print("Building BM25 index...")
    from src.retrieve import _tokenize  # noqa: PLC0415

    # Run the project tokeniser first and re-join its tokens with spaces so
    # bm25s.tokenize operates on the already-split terms.
    tokenized = bm25s.tokenize([" ".join(_tokenize(t)) for t in texts])
    bm25_index = bm25s.BM25(method="bm25+")
    bm25_index.index(tokenized)
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_index, f)
    print(f" Saved {bm25_path}")

    # -- Dense index --------------------------------------------------------
    print(f"Loading embedding model: {EMBED_MODEL}")
    model = SentenceTransformer(EMBED_MODEL)

    print("Embedding chunks (this may take a few minutes on CPU)...")
    vecs = _embed_batched(model, texts)

    dim = vecs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vecs)
    faiss.write_index(index, faiss_path)
    print(f" Saved {faiss_path} ({index.ntotal} vectors, dim={dim})")

    print("\nDone. Commit the data/ directory to your Spaces repo.")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    # CLI entry point: read the two path options and run the full build.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--repo",
        default="freecad-docs",
        help="Path to the cloned FreeCAD-documentation repository",
    )
    cli.add_argument("--data-dir", default="data")
    options = cli.parse_args()
    build(options.repo, options.data_dir)
|
freecad-docs
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 0499378a238ce4c77c643b9cc4a03d0947381e45
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.5.1
|
| 2 |
+
openai>=1.50.0
|
| 3 |
+
sentence-transformers>=3.0.0
|
| 4 |
+
bm25s>=0.2.6
|
| 5 |
+
faiss-cpu>=1.8.0
|
| 6 |
+
langchain-text-splitters>=0.3.0
|
| 7 |
+
markdown-it-py>=3.0.0
|
| 8 |
+
huggingface_hub>=0.25.0
|
| 9 |
+
numpy>=2.0
|
| 10 |
+
tiktoken>=0.7.0
|
| 11 |
+
pyarrow>=15.0
|
| 12 |
+
pandas>=2.0.0
|
| 13 |
+
tqdm>=4.66.0
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
src/__pycache__/chunk.cpython-313.pyc
ADDED
|
Binary file (5.46 kB). View file
|
|
|
src/__pycache__/citations.cpython-313.pyc
ADDED
|
Binary file (2.49 kB). View file
|
|
|
src/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (806 Bytes). View file
|
|
|
src/__pycache__/generate.cpython-313.pyc
ADDED
|
Binary file (7.96 kB). View file
|
|
|
src/__pycache__/ingest.cpython-313.pyc
ADDED
|
Binary file (3.34 kB). View file
|
|
|
src/__pycache__/retrieve.cpython-313.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
src/chunk.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Markdown-aware, code-block-preserving chunker for FreeCAD wiki pages."""
|
| 2 |
+
import re
|
| 3 |
+
import uuid
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import tiktoken
|
| 7 |
+
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
| 8 |
+
|
| 9 |
+
from src.config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 10 |
+
|
| 11 |
+
_enc = tiktoken.get_encoding("cl100k_base")
|
| 12 |
+
|
| 13 |
+
# Matches fenced code blocks (``` or ~~~) including whatever info string follows
# the opening fence on the same line ("python", "c++", "python " with trailing
# space, ...). Restricting the info string to \w characters made such fences
# fail to match, leaving their code unprotected and mis-pairing later fences.
# A backtick fence's info string may not itself contain backticks.
_FENCE_RE = re.compile(r"(```[^\n`]*\n.*?```|~~~[^\n]*\n.*?~~~)", re.DOTALL)
|
| 15 |
+
|
| 16 |
+
_HEADERS_TO_SPLIT = [("#", "h1"), ("##", "h2"), ("###", "h3")]
|
| 17 |
+
|
| 18 |
+
_SPLITTER = RecursiveCharacterTextSplitter(
|
| 19 |
+
separators=["\n\n", "\n", ". ", " ", ""],
|
| 20 |
+
chunk_size=CHUNK_SIZE * 4, # chars; ~4 chars per token
|
| 21 |
+
chunk_overlap=CHUNK_OVERLAP * 4,
|
| 22 |
+
length_function=len,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _count_tokens(text: str) -> int:
    """Return how many tokens the module-level encoder produces for *text*."""
    token_ids = _enc.encode(text)
    return len(token_ids)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _protect_code_blocks(text: str) -> tuple[str, dict[str, str]]:
    """Swap each fenced code block for a unique token.

    Returns the masked text together with a ``token -> original block`` map
    that ``_restore_code_blocks`` can use to undo the substitution.
    """
    stash: dict[str, str] = {}

    def _mask(match: re.Match) -> str:
        # A random hex id keeps tokens unique within and across calls.
        token = f"__CODEBLOCK_{uuid.uuid4().hex}__"
        stash[token] = match.group(0)
        return token

    masked = _FENCE_RE.sub(_mask, text)
    return masked, stash
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _restore_code_blocks(text: str, placeholders: dict[str, str]) -> str:
    """Replace every placeholder token found in *text* with its stored code block.

    Tokens that do not occur in *text* are ignored; a token that occurs more
    than once is replaced at every occurrence.
    """
    restored = text
    for token in placeholders:
        restored = restored.replace(token, placeholders[token])
    return restored
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _classify(text: str) -> str:
    """Label a chunk as ``"code"``, ``"mixed"`` or ``"text"``.

    A chunk has code when it contains a fenced block or a Markdown-style
    indented code line. It has prose when, after removing that code, a run of
    at least 20 letters/spaces remains. Code plus prose is ``"mixed"``, code
    alone is ``"code"``, and anything without code is ``"text"``.
    """
    # Indented code line: four spaces or a tab, then actual content. Using
    # horizontal whitespace only avoids the old ``\s{4}`` pitfall where a run
    # of blank lines (newlines are ``\s`` too) counted as code.
    indented_code = r"^(?: {4}|\t)[ \t]*\S.*$"
    has_code = bool(_FENCE_RE.search(text)) or bool(
        re.search(indented_code, text, re.MULTILINE)
    )

    # Look for prose only outside the code. The previous pattern required 20
    # consecutive letters with no space, which ordinary sentences never
    # contain but long identifiers inside code do.
    outside_code = re.sub(indented_code, "", _FENCE_RE.sub("", text), flags=re.MULTILINE)
    has_prose = bool(re.search(r"[A-Za-z][A-Za-z ]{19,}", outside_code))

    if has_code and has_prose:
        return "mixed"
    if has_code:
        return "code"
    return "text"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def chunk_page(page: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Turn a single wiki page dict into embedding-ready chunk dicts.

    Keys read from *page*: raw_text, page_title, source_file, source_url.
    The page is first split on h1-h3 headers; any section longer than
    CHUNK_SIZE tokens is split again by the recursive character splitter.
    Fenced code blocks are masked during both splits so they stay intact.
    Pieces shorter than 30 tokens are dropped.
    """
    masked_page, page_blocks = _protect_code_blocks(page["raw_text"])

    by_header = MarkdownHeaderTextSplitter(
        headers_to_split_on=_HEADERS_TO_SPLIT, strip_headers=False
    )

    results: list[dict[str, Any]] = []
    for section in by_header.split_text(masked_page):
        headings = section.metadata  # {"h1": ..., "h2": ..., "h3": ...}

        # Token length is measured on the real text, code blocks included.
        body = _restore_code_blocks(section.page_content, page_blocks)

        if _count_tokens(body) > CHUNK_SIZE:
            # Mask the code again so the character splitter cannot cut it.
            masked_body, body_blocks = _protect_code_blocks(body)
            pieces = [
                _restore_code_blocks(part, body_blocks)
                for part in _SPLITTER.split_text(masked_body)
            ]
        else:
            pieces = [body]

        # Most specific heading wins; empty string when the section has none.
        section_label = headings.get("h3") or headings.get("h2") or headings.get("h1") or ""

        for piece in pieces:
            piece = piece.strip()
            if not piece or _count_tokens(piece) < 30:
                continue

            # Prefix page/section context to help BM25 and the embedder.
            if section_label:
                preamble = f"[Page: {page['page_title']} | Section: {section_label}]\n"
            else:
                preamble = f"[Page: {page['page_title']}]\n"
            full_text = preamble + piece

            results.append({
                "source_file": page["source_file"],
                "source_url": page["source_url"],
                "page_title": page["page_title"],
                "section": section_label,
                "type": _classify(piece),
                "text": full_text,
                "token_len": _count_tokens(full_text),
                "char_len": len(full_text),
            })

    return results
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def chunk_pages(pages: list[dict]) -> list[dict]:
    """Chunk every page in order and number the chunks consecutively from 0."""
    merged: list[dict] = [chunk for page in pages for chunk in chunk_page(page)]
    for position, chunk in enumerate(merged):
        chunk["chunk_id"] = position
    return merged
|
src/citations.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from typing import Optional
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
class Citation:
    """One retrieved chunk, numbered for inline ``[N]`` references."""

    # Number shown to the model and the user, e.g. the 1 in "[1]".
    id: int
    # Identifier of the underlying chunk.
    chunk_id: int
    # Link target used when rendering the source list.
    source_url: str
    page_title: str
    # Section heading of the chunk; may be an empty string.
    section: str
    # Text placed in the LLM context block.
    snippet: str
    # Retrieval score; defaults to 0.0 when not supplied.
    score: float = 0.0
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def build_context_block(citations: list[Citation]) -> str:
    """Render citations as numbered context entries separated by ``---`` rules.

    Each entry is a header line (id, page, section, URL) followed by the
    snippet. An empty list yields an empty string.
    """
    return "\n\n---\n\n".join(
        f"[{c.id}] (Page: {c.page_title} | Section: {c.section} | URL: {c.source_url})\n{c.snippet}"
        for c in citations
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_inline_refs(text: str) -> set[int]:
    """Collect the distinct numeric markers written as ``[1]``, ``[12]``, ... in *text*."""
    found: set[int] = set()
    for marker in re.finditer(r"\[(\d+)\]", text):
        found.add(int(marker.group(1)))
    return found
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def render_citation_markdown(citations: list[Citation], used_ids: Optional[set[int]] = None) -> str:
    """Render a Markdown "Sources" list for the given citations.

    Args:
        citations: Citations to list, in display order.
        used_ids: When given, only citations whose ``id`` is in this set are
            listed; ``None`` lists all of them.

    Returns:
        A ``### Sources`` heading followed by one ``N. [label](url)`` line per
        listed citation. The label is ``page — section``, or just the page
        title when the citation has no section.
    """
    lines = ["### Sources"]
    for c in citations:
        if used_ids is not None and c.id not in used_ids:
            continue
        # Section may be empty; avoid a dangling "Title — " label in that case.
        label = f"{c.page_title} — {c.section}" if c.section else c.page_title
        lines.append(f"{c.id}. [{label}]({c.source_url})")
    return "\n".join(lines)
|
src/config.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retrieval models.
EMBED_MODEL = "BAAI/bge-small-en-v1.5"
RERANK_MODEL = "BAAI/bge-reranker-base"

# Chunking, both measured in tokens (the chunker converts to characters).
CHUNK_SIZE = 700
CHUNK_OVERLAP = 120

# Retrieval cut-offs and the Reciprocal Rank Fusion constant.
TOP_K_BM25 = 20
TOP_K_DENSE = 20
TOP_K_FUSED = 30
TOP_N_FINAL = 5
RRF_K = 60

# Generation models offered in the UI; the default is always the first entry.
DEFAULT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
HF_MODELS = [
    DEFAULT_MODEL,
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
]

# Index artefacts. Paths are derived from DATA_DIR so the directory is
# defined in exactly one place; the resulting values are unchanged.
DATA_DIR = "data"
CHUNKS_FILE = f"{DATA_DIR}/chunks.parquet"
FAISS_FILE = f"{DATA_DIR}/index.faiss"
BM25_FILE = f"{DATA_DIR}/bm25.pkl"

WIKI_BASE_URL = "https://wiki.freecad.org"
|
src/generate.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM generation: build prompt, call HuggingFace Inference API, return code + explanation."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from huggingface_hub import InferenceClient
|
| 5 |
+
|
| 6 |
+
from src.citations import Citation, build_context_block, extract_inline_refs, render_citation_markdown
|
| 7 |
+
from src.config import DEFAULT_MODEL
|
| 8 |
+
|
| 9 |
+
# ── few-shot examples ─────────────────────────────────────────────────────────
|
| 10 |
+
|
| 11 |
+
# Few-shot example 1: padded rectangle sketch with a fillet.  The sketch is
# attached through ``AttachmentSupport`` because the system prompt targets
# FreeCAD 1.1 and the attachment property was renamed from ``Support`` in 1.0.
_FEW_SHOT_BOX = '''
### Example 1 β Parametric box with fillets

User: Create a parametric box width=50, height=30, depth=20 with 5mm fillets on vertical edges.

```python
import FreeCAD as App
import Part, Sketcher

doc = App.newDocument("ParametricBox")

body = doc.addObject("PartDesign::Body", "Body")
sketch = body.newObject("Sketcher::SketchObject", "RectSketch")
sketch.AttachmentSupport = (body.Origin.OriginFeatures[3], [""]) # XY_Plane
sketch.MapMode = "FlatFace"

width, depth, height, fillet_r = 50.0, 20.0, 30.0, 5.0

sketch.addGeometry(Part.LineSegment(App.Vector(0,0,0), App.Vector(width,0,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(width,0,0), App.Vector(width,depth,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(width,depth,0), App.Vector(0,depth,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(0,depth,0), App.Vector(0,0,0)), False)
sketch.addConstraint(Sketcher.Constraint("Coincident", 0,2, 1,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 1,2, 2,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 2,2, 3,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 3,2, 0,1))
sketch.addConstraint(Sketcher.Constraint("Horizontal", 0))
sketch.addConstraint(Sketcher.Constraint("Horizontal", 2))
sketch.addConstraint(Sketcher.Constraint("Vertical", 1))
sketch.addConstraint(Sketcher.Constraint("Vertical", 3))
sketch.addConstraint(Sketcher.Constraint("DistanceX", 0, 1, 0, 2, width))
sketch.addConstraint(Sketcher.Constraint("DistanceY", 1, 1, 1, 2, depth))
doc.recompute()

pad = body.newObject("PartDesign::Pad", "Pad")
pad.Profile = sketch
pad.Length = height
doc.recompute()

fillet = body.newObject("PartDesign::Fillet", "Fillet")
fillet.Base = (pad, ["Edge1","Edge2","Edge3","Edge4"])
fillet.Radius = fillet_r
doc.recompute()

doc.saveAs("output.FCStd")
```
'''.strip()
|
| 58 |
+
|
| 59 |
+
# Few-shot example 2: rectangular profile revolved about the sketch's vertical
# axis.  The sketch is attached through ``AttachmentSupport`` because the system
# prompt targets FreeCAD 1.1 and the property was renamed from ``Support`` in 1.0.
_FEW_SHOT_REVOLVE = '''
### Example 2 β Revolved profile (cylinder / shaft)

User: Create a 20mm-diameter, 60mm-long cylindrical shaft using Revolution.

```python
import FreeCAD as App
import Part, Sketcher

doc = App.newDocument("Shaft")
body = doc.addObject("PartDesign::Body", "Body")

sketch = body.newObject("Sketcher::SketchObject", "Profile")
sketch.AttachmentSupport = (body.Origin.OriginFeatures[4], [""]) # XZ_Plane
sketch.MapMode = "FlatFace"

radius, length = 10.0, 60.0

sketch.addGeometry(Part.LineSegment(App.Vector(0,0,0), App.Vector(radius,0,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(radius,0,0), App.Vector(radius,length,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(radius,length,0), App.Vector(0,length,0)), False)
sketch.addGeometry(Part.LineSegment(App.Vector(0,length,0), App.Vector(0,0,0)), False)
sketch.addConstraint(Sketcher.Constraint("Coincident", 0,2, 1,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 1,2, 2,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 2,2, 3,1))
sketch.addConstraint(Sketcher.Constraint("Coincident", 3,2, 0,1))
sketch.addConstraint(Sketcher.Constraint("DistanceX", 0, 1, 0, 2, radius))
sketch.addConstraint(Sketcher.Constraint("DistanceY", 1, 1, 1, 2, length))
sketch.addConstraint(Sketcher.Constraint("PointOnObject", 0, 1, -1)) # origin on Y-axis
doc.recompute()

rev = body.newObject("PartDesign::Revolution", "Revolution")
rev.Profile = sketch
rev.ReferenceAxis = (sketch, ["V_Axis"])
rev.Angle = 360.0
doc.recompute()

doc.saveAs("output.FCStd")
```
'''.strip()
|
| 99 |
+
|
| 100 |
+
# ── system prompt ─────────────────────────────────────────────────────────────
|
| 101 |
+
|
| 102 |
+
# Rules part of the system prompt: role, output contract and known pitfalls.
_PROMPT_RULES = """You are an expert FreeCAD 1.1 Python scripting assistant specialised in \
parametric solid modelling with the PartDesign and Sketcher workbenches.

OUTPUT CONTRACT (strict):
1. Return ONE complete, self-contained Python script enclosed in a single ```python ... ``` block, \
runnable with `freecadcmd script.py`.
2. The script MUST:
- import FreeCAD as App, Part, Sketcher (never import PartDesignGui / FreeCADGui / SketcherGui β they crash headless)
- call App.newDocument(...)
- create a PartDesign::Body BEFORE any Sketch
- attach every Sketch to a standard plane from body.Origin.OriginFeatures (index 3=XY, 4=XZ, 5=YZ)
- call doc.recompute() after EVERY feature creation
- end with doc.saveAs("output.FCStd")
3. Use named variables for every dimension so the model is parametric.
4. Reference geometry by INDEX where possible (e.g. Sketcher.Constraint("Coincident", 0, 2, 1, 1)), \
NOT by topological name strings like "Face1" or "Edge3", to minimise Topological Naming Problem risk \
(mitigated but NOT eliminated in FreeCAD 1.0/1.1).
5. Add all dress-up features (Fillet, Chamfer, etc.) AFTER all additive/subtractive features.
6. After the code block, write one short paragraph explaining the key design decisions.
7. Cite the retrieved sources inline as [1], [2], etc. in comments and in the explanation.
8. End with a numbered citation list: `1. <Page Title> β <URL>`

KNOWN PITFALLS (never repeat these errors):
- Missing doc.recompute() β silent failure
- Mixing App.ActiveDocument and the captured doc variable
- Creating PartDesign features via doc.addObject instead of body.newObject
- Importing *Gui modules in headless scripts"""

# Full system prompt: the rules followed by both few-shot examples, each
# separated by a blank line, with surrounding whitespace stripped.
_SYSTEM_PROMPT = f"{_PROMPT_RULES}\n\n{_FEW_SHOT_BOX}\n\n{_FEW_SHOT_REVOLVE}\n".strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# ── main entry point ──────────────────────────────────────────────────────────
|
| 137 |
+
|
| 138 |
+
def generate_response(
    query: str,
    citations: list[Citation],
    model: str = DEFAULT_MODEL,
) -> tuple[str, str, str]:
    """Ask the chat model for a FreeCAD script answering *query*.

    The retrieved citations are rendered into a context block and sent,
    together with the user request, after the module's system prompt.

    Args:
        query: The user's natural-language request.
        citations: Retrieved documentation chunks used as grounding context.
        model: Model identifier passed to the chat-completion call.

    Returns:
        ``(code_block, explanation_md, error_msg)``:

        * ``code_block``: the raw Python code without fences.  When the reply
          holds no recognisable fenced block, the whole reply is returned here.
        * ``explanation_md``: the text after the code block plus a markdown
          "Sources" list (limited to inline-cited sources when any are cited).
        * ``error_msg``: non-empty on failure, in which case the other two
          elements are empty strings.
    """
    import re  # noqa: PLC0415 - local so this function does not rely on a module-level import

    if not citations:
        return "", "", "No relevant documentation chunks were retrieved. Try broadening the query."

    client = InferenceClient()
    context = build_context_block(citations)

    user_msg = f"RETRIEVED CONTEXT:\n{context}\n\nUSER REQUEST:\n{query}"

    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.2,
            max_tokens=2500,
        )
    except Exception as exc:  # noqa: BLE001 - surfaced to the UI as an error message
        return "", "", f"HuggingFace API error: {exc}"

    # Guard against a reply with no choices or no content instead of raising
    # IndexError or handing an empty "script" to the caller.
    choices = resp.choices or []
    full_text = (choices[0].message.content or "") if choices else ""
    if not full_text.strip():
        return "", "", "The model returned an empty response. Try again or choose another model."

    # Split the code block from the rest of the response.  The fence tag is
    # matched case-insensitively as "python" or "py", trailing blanks and CRLF
    # are tolerated, and a block cut off by max_tokens (no closing fence) is
    # still captured up to the end of the text.
    code_match = re.search(
        r"```(?:python|py)[ \t]*\r?\n(.*?)(?:```|\Z)",
        full_text,
        re.DOTALL | re.IGNORECASE,
    )
    if code_match:
        code = code_match.group(1).rstrip()
        after_code = full_text[code_match.end():].strip()
    else:
        code = full_text
        after_code = ""

    used_ids = extract_inline_refs(full_text)
    # An empty set means "nothing cited inline": fall back to listing every source.
    cite_md = render_citation_markdown(citations, used_ids or None)
    explain = (after_code + "\n\n" + cite_md).strip()

    return code, explain, ""
|
src/ingest.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parse the FreeCAD-documentation repo into a list of page dicts."""
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterator
|
| 6 |
+
|
| 7 |
+
from src.config import WIKI_BASE_URL
|
| 8 |
+
|
| 9 |
+
# Pages that are never ingested: MediaWiki namespaces and the index page.
# The pattern is searched against the file *stem* (name without ".md"), so the
# index alternative must match a bare "index"; the optional "\.md" keeps it
# working if a full file name is ever passed instead.
_SKIP_PATTERNS = re.compile(
    r"(Category:|File:|Template:|Special:|MediaWiki:|User:|Talk:|^index(\.md)?$)",
    re.IGNORECASE,
)
|
| 13 |
+
|
| 14 |
+
# File stems of pages flagged ``priority`` by iter_pages; load_freecad_docs
# sorts flagged pages ahead of all others.
_PRIORITY_PAGES = {
    # Scripting guides
    "Python_scripting_tutorial",
    "FreeCAD_Scripting_Basics",
    "Scripting_and_macros",
    "Part_scripting",
    "Sketcher_scripting",
    "PartDesign_scripting",
    "Topological_naming_problem",
    "Scripted_objects",
    "Scripted_objects_migration",
    # PartDesign features
    "PartDesign_Pad",
    "PartDesign_Pocket",
    "PartDesign_Revolution",
    "PartDesign_Body",
    "PartDesign_Fillet",
    "PartDesign_Chamfer",
    "PartDesign_Hole",
    "PartDesign_Boolean",
    "PartDesign_AdditiveLoft",
    "PartDesign_AdditivePipe",
    # Workbenches, release notes and tutorials
    "PartDesign_Workbench",
    "Sketcher_Workbench",
    "Release_notes_1.0",
    "Release_notes_1.1",
    "Basic_Part_Design_Tutorial_019",
    "Creating_a_simple_part_with_PartDesign",
    "Spreadsheet_Workbench",
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _page_title(stem: str) -> str:
    """Turn a file stem into a display title by swapping underscores for spaces."""
    words = stem.split("_")
    return " ".join(words)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _source_url(stem: str) -> str:
    """Build the wiki URL for a page by appending its stem to WIKI_BASE_URL."""
    return "/".join((WIKI_BASE_URL, stem))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def iter_pages(wiki_dir: str | Path) -> Iterator[dict]:
    """Yield one page dict per usable markdown file directly inside *wiki_dir*.

    Files are visited in sorted path order.  A file is skipped when its stem
    matches ``_SKIP_PATTERNS`` or when its stripped content is shorter than
    200 characters.

    Args:
        wiki_dir: Directory containing the ``*.md`` wiki pages.

    Yields:
        Dicts with the keys ``source_file``, ``page_title``, ``source_url``,
        ``raw_text`` and ``priority``.
    """
    for path in sorted(Path(wiki_dir).glob("*.md")):
        name = path.stem
        if _SKIP_PATTERNS.search(name) is not None:
            continue

        # Undecodable bytes are replaced rather than aborting the whole ingest.
        text = path.read_text(encoding="utf-8", errors="replace")
        if len(text.strip()) < 200:
            continue

        page = {
            "source_file": str(path),
            "page_title": _page_title(name),
            "source_url": _source_url(name),
            "raw_text": text,
            "priority": name in _PRIORITY_PAGES,
        }
        yield page
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_freecad_docs(repo_root: str) -> list[dict]:
    """Load every usable wiki page from a FreeCAD-documentation checkout.

    Args:
        repo_root: Path to the cloned repository; pages are read from its
            ``wiki`` subdirectory.

    Returns:
        Page dicts ordered with priority pages first, then by page title.

    Raises:
        FileNotFoundError: If ``<repo_root>/wiki`` is not a directory.
    """
    wiki_dir = os.path.join(repo_root, "wiki")
    if os.path.isdir(wiki_dir):
        # ``not priority`` is False for priority pages, so they sort first.
        return sorted(
            iter_pages(wiki_dir),
            key=lambda page: (not page["priority"], page["page_title"]),
        )
    raise FileNotFoundError(
        f"Expected wiki/ directory at {wiki_dir}. "
        "Clone https://github.com/FreeCAD/FreeCAD-documentation first."
    )
|
src/retrieve.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Retrieval: BM25 + Dense (FAISS) + RRF fusion + cross-encoder reranking."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import pickle
|
| 6 |
+
import re
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from src.citations import Citation
|
| 13 |
+
from src.config import (
|
| 14 |
+
BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE,
|
| 15 |
+
RRF_K, RERANK_MODEL, TOP_K_BM25, TOP_K_DENSE, TOP_K_FUSED, TOP_N_FINAL,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# ── tokeniser ─────────────────────────────────────────────────────────────────
|
| 19 |
+
|
| 20 |
+
# Identifier-like tokens (may contain ".", ":" and "_") or runs of digits.
_TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_.:]*|\d+")
# Zero-width split point before every capital letter except at the start.
_CAMEL_RE = re.compile(r"(?<!^)(?=[A-Z])")
# Lower-cased words dropped from the token stream.
_STOP = set("the a an of to in is are and or this that it be".split())


def _tokenize(text: str) -> list[str]:
    """Split *text* into lower-cased search tokens.

    For every raw token that is not a stop word, the output contains, in order:

    1. the whole token, lower-cased;
    2. its pieces split before capital letters, when that yields more than one;
    3. its pieces split on runs of ``.``, ``_`` and ``:``, except a piece equal
       to the whole token.

    Stop words and empty pieces are dropped.  Duplicates are kept.
    """
    emitted: list[str] = []
    for raw in _TOKEN_RE.findall(text):
        whole = raw.lower()
        if whole in _STOP:
            continue
        emitted.append(whole)

        humps = _CAMEL_RE.split(raw)
        if len(humps) >= 2:
            for hump in humps:
                lowered = hump.lower()
                if hump and lowered not in _STOP:
                    emitted.append(lowered)

        for piece in re.split(r"[._:]+", raw):
            lowered = piece.lower()
            if not piece or lowered in _STOP or lowered == whole:
                continue
            emitted.append(lowered)
    return emitted
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ── lazy singletons ───────────────────────────────────────────────────────────
|
| 43 |
+
|
| 44 |
+
# Module-level cache slots.  Each stays None until the matching _load_* helper
# is first called and stores the loaded object here.
_chunks_df: Optional[pd.DataFrame] = None  # chunk table read from CHUNKS_FILE
_bm25_index = None  # object unpickled from BM25_FILE
_faiss_index = None  # index read from FAISS_FILE
_embed_model = None  # SentenceTransformer built from EMBED_MODEL
_rerank_model = None  # CrossEncoder built from RERANK_MODEL
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _load_chunks() -> pd.DataFrame:
    """Return the chunk table, reading CHUNKS_FILE on first use and caching it.

    Raises:
        FileNotFoundError: If CHUNKS_FILE does not exist when first needed.
    """
    global _chunks_df
    if _chunks_df is not None:
        return _chunks_df
    if not os.path.exists(CHUNKS_FILE):
        raise FileNotFoundError(
            f"{CHUNKS_FILE} not found. Run `python build_index.py` first."
        )
    _chunks_df = pd.read_parquet(CHUNKS_FILE)
    return _chunks_df
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _load_bm25():
    """Return the BM25 index, unpickling BM25_FILE on first use and caching it.

    Raises:
        FileNotFoundError: If BM25_FILE does not exist when first needed.
    """
    global _bm25_index
    if _bm25_index is not None:
        return _bm25_index
    if not os.path.exists(BM25_FILE):
        raise FileNotFoundError(f"{BM25_FILE} not found.")
    # NOTE(review): pickle executes code on load; this assumes BM25_FILE is a
    # locally built, trusted artefact.
    with open(BM25_FILE, "rb") as handle:
        _bm25_index = pickle.load(handle)
    return _bm25_index
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _load_faiss():
    """Return the FAISS index, reading FAISS_FILE on first use and caching it.

    ``faiss`` is imported here rather than at module level, so the import only
    happens when the index is first needed.

    Raises:
        FileNotFoundError: If FAISS_FILE does not exist when first needed.
    """
    global _faiss_index
    if _faiss_index is not None:
        return _faiss_index
    import faiss  # noqa: PLC0415
    if not os.path.exists(FAISS_FILE):
        raise FileNotFoundError(f"{FAISS_FILE} not found.")
    _faiss_index = faiss.read_index(FAISS_FILE)
    return _faiss_index
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _load_embed():
    """Return the sentence-embedding model, creating and caching it on first use."""
    global _embed_model
    if _embed_model is not None:
        return _embed_model
    from sentence_transformers import SentenceTransformer  # noqa: PLC0415
    _embed_model = SentenceTransformer(EMBED_MODEL)
    return _embed_model
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _load_reranker():
    """Return the cross-encoder reranker, creating and caching it on first use."""
    global _rerank_model
    if _rerank_model is not None:
        return _rerank_model
    from sentence_transformers import CrossEncoder  # noqa: PLC0415
    _rerank_model = CrossEncoder(RERANK_MODEL)
    return _rerank_model
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def indices_ready() -> bool:
    """Report whether the chunk table, BM25 index and FAISS index files all exist."""
    for artefact in (CHUNKS_FILE, BM25_FILE, FAISS_FILE):
        if not os.path.exists(artefact):
            return False
    return True
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ── retrieval methods ─────────────────────────────────────────────────────────
|
| 103 |
+
|
| 104 |
+
def _bm25_search(query: str, top_k: int) -> list[tuple[int, float]]:
    """Run a BM25 search and return ``[(chunk_id, score), ...]``.

    The query is first passed through this module's ``_tokenize`` and the
    resulting tokens are re-joined with spaces before being handed to
    ``bm25s.tokenize``.

    Args:
        query: Free-text user query.
        top_k: Number of hits requested from the index.

    Returns:
        Hits in the order returned by the index.  An empty list when the query
        contains no searchable tokens (e.g. only stop words or punctuation), so
        that arbitrary documents do not earn rank credit in later fusion.

    Raises:
        FileNotFoundError: If the BM25 index file is missing.
    """
    import bm25s  # noqa: PLC0415
    bm25 = _load_bm25()

    query_terms = _tokenize(query)
    if not query_terms:
        # Nothing to match on: skip the lookup rather than send an empty query.
        return []

    query_tokens_arr = bm25s.tokenize([" ".join(query_terms)])
    results, scores = bm25.retrieve(query_tokens_arr, k=top_k)
    return list(zip(results[0].tolist(), scores[0].tolist()))
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _dense_search(query: str, top_k: int) -> list[tuple[int, float]]:
    """Run a dense vector search and return ``[(chunk_id, score), ...]``.

    The query is embedded with the cached embedding model (normalised,
    float32, shaped as a single row) and searched in the cached FAISS index.
    Entries with a negative id are dropped (presumably FAISS padding when
    fewer than ``top_k`` vectors are available - confirm).
    """
    encoder = _load_embed()
    index = _load_faiss()

    # BGE models expect a query prefix
    prompt = f"Represent this sentence for searching relevant passages: {query}"
    embedding = encoder.encode(prompt, normalize_embeddings=True)
    query_vec = embedding.reshape(1, -1).astype("float32")

    scores, ids = index.search(query_vec, top_k)
    hits: list[tuple[int, float]] = []
    for chunk_id, score in zip(ids[0], scores[0]):
        if chunk_id >= 0:
            hits.append((int(chunk_id), float(score)))
    return hits
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _rrf_fuse(
    bm25_hits: list[tuple[int, float]],
    dense_hits: list[tuple[int, float]],
    k: int = RRF_K,
    top_n: int = TOP_K_FUSED,
) -> list[tuple[int, float]]:
    """Merge two ranked hit lists with reciprocal rank fusion.

    Each chunk receives ``1 / (k + rank)`` (rank starting at 1) from every
    list it appears in; the incoming scores themselves are ignored.

    Args:
        bm25_hits: Ranked ``(chunk_id, score)`` pairs from BM25.
        dense_hits: Ranked ``(chunk_id, score)`` pairs from the dense search.
        k: Constant added to the rank in the denominator.
        top_n: Maximum number of fused hits to return.

    Returns:
        ``(chunk_id, fused_score)`` pairs sorted by fused score, highest first.
    """
    fused: dict[int, float] = {}
    for hit_list in (bm25_hits, dense_hits):
        for position, (chunk_id, _score) in enumerate(hit_list, start=1):
            fused[chunk_id] = fused.get(chunk_id, 0.0) + 1.0 / (k + position)

    by_score = sorted(fused.items(), key=lambda item: item[1], reverse=True)
    return by_score[:top_n]
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _rerank(query: str, hits: list[tuple[int, float]], top_n: int, df: pd.DataFrame) -> list[tuple[int, float]]:
    """Re-score candidate chunks with the cross-encoder and keep the best ones.

    Args:
        query: The user query paired with each candidate text.
        hits: Candidate ``(chunk_id, score)`` pairs; the incoming scores are ignored.
        top_n: Number of hits to keep.
        df: Chunk table; ``df.loc[chunk_id, "text"]`` supplies each candidate's text.

    Returns:
        Up to ``top_n`` ``(chunk_id, reranker_score)`` pairs, best first.
    """
    reranker = _load_reranker()
    chunk_ids = [chunk_id for chunk_id, _ in hits]
    pairs = [(query, df.loc[chunk_id, "text"]) for chunk_id in chunk_ids]
    relevance = reranker.predict(pairs)

    ordered = sorted(zip(chunk_ids, relevance), key=lambda pair: pair[1], reverse=True)
    best = ordered[:top_n]
    return [(int(chunk_id), float(score)) for chunk_id, score in best]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# ── public API ────────────────────────────────────────────────────────────────
|
| 148 |
+
|
| 149 |
+
class HybridRetriever:
    """Retrieve documentation chunks with BM25, dense search, fusion and reranking.

    Each stage can be switched off independently.
    """

    def __init__(
        self,
        use_bm25: bool = True,
        use_dense: bool = True,
        use_rerank: bool = True,
        top_n: int = TOP_N_FINAL,
    ):
        """Store the stage switches and the number of results to return."""
        self.use_bm25 = use_bm25
        self.use_dense = use_dense
        self.use_rerank = use_rerank
        self.top_n = top_n

    def retrieve(self, query: str) -> list[Citation]:
        """Return up to ``top_n`` citations for *query*, numbered from 1.

        With both retrievers enabled their hits are fused by reciprocal rank
        fusion; with one enabled its hits are used directly (capped at
        TOP_K_FUSED); with neither, an empty list is returned.  Candidates are
        then reranked when ``use_rerank`` is set, otherwise truncated.
        """
        df = _load_chunks()

        bm25_hits: list[tuple[int, float]] = (
            _bm25_search(query, TOP_K_BM25) if self.use_bm25 else []
        )
        dense_hits: list[tuple[int, float]] = (
            _dense_search(query, TOP_K_DENSE) if self.use_dense else []
        )

        if not (self.use_bm25 or self.use_dense):
            return []
        if self.use_bm25 and self.use_dense:
            candidates = _rrf_fuse(bm25_hits, dense_hits)
        else:
            only_hits = bm25_hits if self.use_bm25 else dense_hits
            candidates = only_hits[:TOP_K_FUSED]

        if self.use_rerank and candidates:
            final = _rerank(query, candidates, self.top_n, df)
        else:
            final = candidates[:self.top_n]

        results: list[Citation] = []
        for position, (chunk_id, score) in enumerate(final, start=1):
            row = df.loc[chunk_id]
            citation = Citation(
                id=position,
                chunk_id=int(chunk_id),
                source_url=str(row["source_url"]),
                page_title=str(row["page_title"]),
                section=str(row.get("section", "")),
                # Only the first 600 characters of the chunk are kept as the snippet.
                snippet=str(row["text"])[:600],
                score=float(score),
            )
            results.append(citation)
        return results
|