Spaces:

Undrick
/

NLP_Lab

Sleeping

NLP_Lab / src /ingest.py

apytel

Redesigns UI for FreeCAD RAG Python script generator

11ba2bd about 1 month ago

2.18 kB

	"""Parse the FreeCAD-documentation repo into a list of page dicts."""
	import os
	import re
	from pathlib import Path
	from typing import Iterator

	from src.config import WIKI_BASE_URL

	# Stems matching this pattern are never ingested.
	# - The namespace prefixes (Category:, File:, ...) match anywhere in the stem,
	#   case-insensitively.
	# - The index alternative is anchored to the whole stem. The pattern is
	#   searched against ``Path.stem`` (no extension), so the ".md" suffix is
	#   optional: both "index" and "index.md" match.
	# Alternatives are separated by a bare ``|``; an escaped ``\|`` would match a
	# literal pipe character and disable the alternation.
	_SKIP_PATTERNS = re.compile(
	    r"(Category:|File:|Template:|Special:|MediaWiki:|User:|Talk:|^index(?:\.md)?$)",
	    re.IGNORECASE,
	)

	# File stems that ``iter_pages`` marks with ``"priority": True``.
	# Membership is an exact, case-sensitive match on the stem.
	_PRIORITY_PAGES = {
	    # *scripting* / scripted-object pages
	    "Python_scripting_tutorial",
	    "FreeCAD_Scripting_Basics",
	    "Scripting_and_macros",
	    "Part_scripting",
	    "Sketcher_scripting",
	    "PartDesign_scripting",
	    "Scripted_objects",
	    "Scripted_objects_migration",
	    # PartDesign_* pages
	    "PartDesign_Pad",
	    "PartDesign_Pocket",
	    "PartDesign_Revolution",
	    "PartDesign_Body",
	    "PartDesign_Fillet",
	    "PartDesign_Chamfer",
	    "PartDesign_Hole",
	    "PartDesign_Boolean",
	    "PartDesign_AdditiveLoft",
	    "PartDesign_AdditivePipe",
	    "PartDesign_Workbench",
	    # other workbench pages
	    "Sketcher_Workbench",
	    "Spreadsheet_Workbench",
	    # release notes
	    "Release_notes_1.0",
	    "Release_notes_1.1",
	    # remaining pages
	    "Topological_naming_problem",
	    "Basic_Part_Design_Tutorial_019",
	    "Creating_a_simple_part_with_PartDesign",
	}


	def _page_title(stem: str) -> str:
	    """Return a display title for a wiki file stem.

	    Every underscore in ``stem`` is replaced with a space; no other
	    character is altered.
	    """
	    return stem.replace("_", " ")


	def _source_url(stem: str) -> str:
	    """Return the wiki URL for a page, built from ``WIKI_BASE_URL`` and ``stem``.

	    The stem is appended verbatim after a single ``/``. Trailing slashes on
	    the configured base are dropped first so the result never contains
	    ``//`` at the join point.
	    """
	    base = str(WIKI_BASE_URL).rstrip("/")
	    return f"{base}/{stem}"


	def iter_pages(wiki_dir: str | Path, *, min_chars: int = 200) -> Iterator[dict]:
	    """Yield one page dict for each ingestible ``*.md`` file directly in ``wiki_dir``.

	    Files are visited in sorted path order (non-recursive). A file is skipped
	    when its stem matches ``_SKIP_PATTERNS`` or when its text, stripped of
	    leading/trailing whitespace, is shorter than ``min_chars`` characters.

	    Args:
	        wiki_dir: Directory containing the Markdown pages.
	        min_chars: Minimum stripped length a page must have to be yielded.

	    Yields:
	        Dicts with the keys ``source_file`` (path as a string),
	        ``page_title``, ``source_url``, ``raw_text`` (the unstripped file
	        content) and ``priority`` (whether the stem is in
	        ``_PRIORITY_PAGES``).
	    """
	    wiki_path = Path(wiki_dir)
	    for md_file in sorted(wiki_path.glob("*.md")):
	        # glob() also returns directories whose name ends in ".md";
	        # read_text() would raise on those.
	        if not md_file.is_file():
	            continue
	        stem = md_file.stem
	        if _SKIP_PATTERNS.search(stem):
	            continue
	        # errors="replace" keeps ingestion going past invalid UTF-8 bytes.
	        raw = md_file.read_text(encoding="utf-8", errors="replace")
	        if len(raw.strip()) < min_chars:
	            continue
	        yield {
	            "source_file": str(md_file),
	            "page_title": _page_title(stem),
	            "source_url": _source_url(stem),
	            "raw_text": raw,
	            "priority": stem in _PRIORITY_PAGES,
	        }


	def load_freecad_docs(repo_root: str) -> list[dict]:
	wiki_dir = os.path.join(repo_root, "wiki")
	if not os.path.isdir(wiki_dir):
	raise FileNotFoundError(
	f"Expected wiki/ directory at {wiki_dir}. "
	"Clone https://github.com/FreeCAD/FreeCAD-documentation first."
	)
	pages = list(iter_pages(wiki_dir))
	pages.sort(key=lambda p: (not p["priority"], p["page_title"]))
	return pages