NLP_Lab / src /ingest.py
apytel
Redesigns UI for FreeCAD RAG Python script generator
11ba2bd
Raw
History Blame Contribute Delete
2.18 kB
"""Parse the FreeCAD-documentation repo into a list of page dicts."""
import os
import re
from pathlib import Path
from typing import Iterator
from src.config import WIKI_BASE_URL
# Pages that should never be ingested: MediaWiki namespace pages and the
# repository index page.  The pattern is searched against the file *stem*
# (the name without ".md"), so the index alternative must match a bare
# "index"; the optional "\.md" suffix keeps it working if a full file name
# is ever passed instead.
_SKIP_PATTERNS = re.compile(
    r"(Category:|File:|Template:|Special:|MediaWiki:|User:|Talk:|^index(?:\.md)?$)",
    re.IGNORECASE,
)
# File stems that get ``"priority": True`` in the page dicts produced by
# ``iter_pages``.  Membership is an exact, case-sensitive match on the stem.
_PRIORITY_PAGES = {
    # Scripting-related pages
    "Python_scripting_tutorial",
    "FreeCAD_Scripting_Basics",
    "Scripting_and_macros",
    "Part_scripting",
    "Sketcher_scripting",
    "PartDesign_scripting",
    "Scripted_objects",
    "Scripted_objects_migration",
    "Topological_naming_problem",
    # PartDesign pages
    "PartDesign_Body",
    "PartDesign_Pad",
    "PartDesign_Pocket",
    "PartDesign_Revolution",
    "PartDesign_Fillet",
    "PartDesign_Chamfer",
    "PartDesign_Hole",
    "PartDesign_Boolean",
    "PartDesign_AdditiveLoft",
    "PartDesign_AdditivePipe",
    "PartDesign_Workbench",
    "Basic_Part_Design_Tutorial_019",
    "Creating_a_simple_part_with_PartDesign",
    # Other workbenches and release notes
    "Sketcher_Workbench",
    "Spreadsheet_Workbench",
    "Release_notes_1.0",
    "Release_notes_1.1",
}
def _page_title(stem: str) -> str:
    """Return a display title for *stem* with every underscore turned into a space."""
    words = stem.split("_")
    return " ".join(words)
def _source_url(stem: str) -> str:
    """Return the URL for *stem*: ``WIKI_BASE_URL`` and the stem joined by one slash.

    The stem is appended verbatim; no quoting or normalisation is applied.
    """
    base_url = WIKI_BASE_URL
    return f"{base_url}/{stem}"
def iter_pages(wiki_dir: str | Path) -> Iterator[dict]:
    """Yield one page dict per usable Markdown file directly inside *wiki_dir*.

    Files are visited in sorted path order.  A file is skipped when its stem
    matches ``_SKIP_PATTERNS`` or when its text, stripped of surrounding
    whitespace, is shorter than 200 characters.

    Each yielded dict has the keys ``source_file``, ``page_title``,
    ``source_url``, ``raw_text`` and ``priority``.
    """
    min_chars = 200
    markdown_files = sorted(Path(wiki_dir).glob("*.md"))

    for path in markdown_files:
        name = path.stem
        if _SKIP_PATTERNS.search(name) is not None:
            continue

        # Undecodable bytes are replaced rather than raising, so one bad
        # file cannot abort the whole ingest.
        text = path.read_text(encoding="utf-8", errors="replace")
        if len(text.strip()) < min_chars:
            continue

        page = {
            "source_file": str(path),
            "page_title": _page_title(name),
            "source_url": _source_url(name),
            "raw_text": text,
            "priority": name in _PRIORITY_PAGES,
        }
        yield page
def load_freecad_docs(repo_root: str) -> list[dict]:
    """Load every usable wiki page found under ``<repo_root>/wiki``.

    Returns the page dicts produced by ``iter_pages``, ordered so that
    priority pages come first and, within each group, pages are sorted by
    ``page_title``.

    Raises:
        FileNotFoundError: if ``<repo_root>/wiki`` is not an existing directory.
    """
    wiki_dir = os.path.join(repo_root, "wiki")
    if os.path.isdir(wiki_dir):
        # False sorts before True, so negating the flag puts priority pages first.
        return sorted(
            iter_pages(wiki_dir),
            key=lambda page: (not page["priority"], page["page_title"]),
        )

    raise FileNotFoundError(
        f"Expected wiki/ directory at {wiki_dir}. "
        "Clone https://github.com/FreeCAD/FreeCAD-documentation first."
    )