| """Parse the FreeCAD-documentation repo into a list of page dicts.""" |
| import os |
| import re |
| from pathlib import Path |
| from typing import Iterator |
|
|
| from src.config import WIKI_BASE_URL |
|
|
# Names that must not be turned into pages: anything containing one of the
# listed "Namespace:" prefixes, plus the index page itself.
#
# iter_pages() tests this pattern against ``Path.stem``, which no longer
# carries the ".md" suffix, so an alternative written as ``^index\.md$`` can
# never match there.  The suffix is therefore optional: both the bare stem
# ("index") and a full file name ("index.md") are recognised.
_SKIP_PATTERNS = re.compile(
    r"(Category:|File:|Template:|Special:|MediaWiki:|User:|Talk:|^index(?:\.md)?$)",
    re.IGNORECASE,
)
|
|
# File stems (file name without ".md") that iter_pages() flags with
# ``priority=True``; load_freecad_docs() sorts those pages ahead of the rest.
_PRIORITY_PAGES = {
    # Scripting and scripted objects
    "Python_scripting_tutorial",
    "FreeCAD_Scripting_Basics",
    "Scripting_and_macros",
    "Part_scripting",
    "Sketcher_scripting",
    "PartDesign_scripting",
    "Topological_naming_problem",
    "Scripted_objects",
    "Scripted_objects_migration",
    # PartDesign pages
    "PartDesign_Pad",
    "PartDesign_Pocket",
    "PartDesign_Revolution",
    "PartDesign_Body",
    "PartDesign_Fillet",
    "PartDesign_Chamfer",
    "PartDesign_Hole",
    "PartDesign_Boolean",
    "PartDesign_AdditiveLoft",
    "PartDesign_AdditivePipe",
    "PartDesign_Workbench",
    # Other workbenches, release notes and tutorials
    "Sketcher_Workbench",
    "Release_notes_1.0",
    "Release_notes_1.1",
    "Basic_Part_Design_Tutorial_019",
    "Creating_a_simple_part_with_PartDesign",
    "Spreadsheet_Workbench",
}
|
|
|
|
def _page_title(stem: str) -> str:
    """Return a display title for a page file stem.

    Every underscore in *stem* becomes a single space; all other characters
    are left as they are.
    """
    pieces = stem.split("_")
    return " ".join(pieces)
|
|
|
|
def _source_url(stem: str) -> str:
    """Return the page URL for *stem*: ``WIKI_BASE_URL``, a slash, then the stem.

    The stem is appended verbatim; no escaping or normalisation is applied.
    """
    return "{}/{}".format(WIKI_BASE_URL, stem)
|
|
|
|
| def iter_pages(wiki_dir: str | Path) -> Iterator[dict]: |
| wiki_path = Path(wiki_dir) |
| for md_file in sorted(wiki_path.glob("*.md")): |
| stem = md_file.stem |
| if _SKIP_PATTERNS.search(stem): |
| continue |
| raw = md_file.read_text(encoding="utf-8", errors="replace") |
| if len(raw.strip()) < 200: |
| continue |
| yield { |
| "source_file": str(md_file), |
| "page_title": _page_title(stem), |
| "source_url": _source_url(stem), |
| "raw_text": raw, |
| "priority": stem in _PRIORITY_PAGES, |
| } |
|
|
|
|
def load_freecad_docs(repo_root: str) -> list[dict]:
    """Return every page found under ``<repo_root>/wiki`` as a sorted list.

    Pages whose ``priority`` flag is true come first; within each group the
    pages are ordered by ``page_title``.

    Raises:
        FileNotFoundError: If ``<repo_root>/wiki`` is not an existing
            directory.
    """
    wiki_dir = os.path.join(repo_root, "wiki")

    if os.path.isdir(wiki_dir):

        def _rank(page: dict) -> tuple:
            # False sorts before True, so negating the flag puts priority
            # pages at the front.
            return (not page["priority"], page["page_title"])

        return sorted(iter_pages(wiki_dir), key=_rank)

    message = (
        f"Expected wiki/ directory at {wiki_dir}. "
        "Clone https://github.com/FreeCAD/FreeCAD-documentation first."
    )
    raise FileNotFoundError(message)
|
|