"""Parse the FreeCAD-documentation repo into a list of page dicts."""

import os
import re
from pathlib import Path
from typing import Iterator

from src.config import WIKI_BASE_URL

# File names to exclude: anything containing one of the listed namespace-style
# prefixes (searched anywhere in the name, case-insensitively), plus a file
# named exactly ``index.md``.
_SKIP_PATTERNS = re.compile(
    r"(Category:|File:|Template:|Special:|MediaWiki:|User:|Talk:|^index\.md$)",
    re.IGNORECASE,
)

# Pages whose whitespace-stripped text is shorter than this are skipped.
_MIN_PAGE_CHARS = 200

# File stems (file name without ``.md``) that are flagged ``priority=True`` and
# therefore sorted ahead of all other pages by ``load_freecad_docs``.
_PRIORITY_PAGES = {
    "Python_scripting_tutorial",
    "FreeCAD_Scripting_Basics",
    "Scripting_and_macros",
    "Part_scripting",
    "Sketcher_scripting",
    "PartDesign_scripting",
    "Topological_naming_problem",
    "Scripted_objects",
    "Scripted_objects_migration",
    "PartDesign_Pad",
    "PartDesign_Pocket",
    "PartDesign_Revolution",
    "PartDesign_Body",
    "PartDesign_Fillet",
    "PartDesign_Chamfer",
    "PartDesign_Hole",
    "PartDesign_Boolean",
    "PartDesign_AdditiveLoft",
    "PartDesign_AdditivePipe",
    "PartDesign_Workbench",
    "Sketcher_Workbench",
    "Release_notes_1.0",
    "Release_notes_1.1",
    "Basic_Part_Design_Tutorial_019",
    "Creating_a_simple_part_with_PartDesign",
    "Spreadsheet_Workbench",
}


def _page_title(stem: str) -> str:
    """Return a display title for *stem* by replacing underscores with spaces."""
    return stem.replace("_", " ")


def _source_url(stem: str) -> str:
    """Return the page URL for *stem*: ``WIKI_BASE_URL``, a slash, then the stem."""
    return f"{WIKI_BASE_URL}/{stem}"


def iter_pages(wiki_dir: str | Path) -> Iterator[dict]:
    """Yield one page dict per usable Markdown file directly inside *wiki_dir*.

    Files are visited in sorted path order; sub-directories are not searched.
    A file is skipped when its name matches ``_SKIP_PATTERNS``, when it is not
    a regular file, or when its text, stripped of surrounding whitespace, is
    shorter than ``_MIN_PAGE_CHARS`` characters.

    Args:
        wiki_dir: Directory containing the ``*.md`` pages.

    Yields:
        Dicts with the keys ``source_file`` (path as a string), ``page_title``,
        ``source_url``, ``raw_text`` (file content decoded as UTF-8, with
        undecodable bytes replaced) and ``priority`` (whether the file stem is
        listed in ``_PRIORITY_PAGES``).
    """
    wiki_path = Path(wiki_dir)
    for md_file in sorted(wiki_path.glob("*.md")):
        # Match against the full file name rather than the stem: the stem has
        # no extension, so the ``^index\.md$`` alternative could never match
        # it and ``index.md`` would slip through the filter.
        if _SKIP_PATTERNS.search(md_file.name):
            continue
        # The glob can also return directories named ``*.md``; reading one
        # would raise, so skip anything that is not a regular file.
        if not md_file.is_file():
            continue

        raw = md_file.read_text(encoding="utf-8", errors="replace")
        if len(raw.strip()) < _MIN_PAGE_CHARS:
            continue

        stem = md_file.stem
        yield {
            "source_file": str(md_file),
            "page_title": _page_title(stem),
            "source_url": _source_url(stem),
            "raw_text": raw,
            "priority": stem in _PRIORITY_PAGES,
        }


def load_freecad_docs(repo_root: str) -> list[dict]:
    """Load every usable page from the ``wiki`` directory under *repo_root*.

    Args:
        repo_root: Path to the directory that contains ``wiki/``.

    Returns:
        The page dicts produced by ``iter_pages``, with priority pages first
        and each group ordered by ``page_title``.

    Raises:
        FileNotFoundError: If ``<repo_root>/wiki`` is not an existing directory.
    """
    wiki_dir = os.path.join(repo_root, "wiki")
    if not os.path.isdir(wiki_dir):
        raise FileNotFoundError(
            f"Expected wiki/ directory at {wiki_dir}. "
            "Clone https://github.com/FreeCAD/FreeCAD-documentation first."
        )
    # ``not priority`` is False for priority pages, and False sorts before
    # True, so priority pages lead; the title breaks ties within each group.
    return sorted(
        iter_pages(wiki_dir),
        key=lambda p: (not p["priority"], p["page_title"]),
    )