#!/usr/bin/env python3 """Validate local Markdown links and anchors.""" from __future__ import annotations import re import sys import urllib.parse from collections import Counter from pathlib import Path LINK_RE = re.compile(r"(? str: text = re.sub(r"<[^>]+>", "", heading) text = re.sub(r"[`*_]", "", text).strip().lower() text = re.sub(r"[^a-z0-9\u4e00-\u9fff -]", "", text) text = re.sub(r"\s+", "-", text) base = text.strip("-") seen[base] += 1 if seen[base] == 1: return base return f"{base}-{seen[base] - 1}" def anchors_for(path: Path) -> set[str]: seen: Counter[str] = Counter() anchors: set[str] = set() for line in path.read_text(encoding="utf-8").splitlines(): match = HEADING_RE.match(line) if match: anchors.add(slugify(match.group(2), seen)) return anchors def markdown_files(root: Path) -> list[Path]: return sorted(path for path in root.rglob("*.md") if ".git" not in path.parts) def normalize_link(raw_link: str) -> tuple[str, str]: link = raw_link.split()[0].strip("<>") parsed = urllib.parse.urlsplit(link) path = urllib.parse.unquote(parsed.path) fragment = urllib.parse.unquote(parsed.fragment) return path, fragment def main() -> int: root = Path(".") anchor_cache: dict[Path, set[str]] = {} failures: list[str] = [] for source in markdown_files(root): text = source.read_text(encoding="utf-8") for raw_link in LINK_RE.findall(text): parsed = urllib.parse.urlsplit(raw_link.strip("<>")) if parsed.scheme in SKIP_SCHEMES: continue if raw_link.startswith("#"): target_path = source fragment = urllib.parse.unquote(parsed.fragment or raw_link.lstrip("#")) elif parsed.scheme: continue else: path_part, fragment = normalize_link(raw_link) if not path_part: target_path = source else: target_path = (source.parent / path_part).resolve().relative_to(root.resolve()) if not target_path.exists(): failures.append(f"{source}: missing linked file {raw_link}") continue if fragment and target_path.suffix.lower() == ".md": anchors = anchor_cache.setdefault(target_path, anchors_for(target_path)) if fragment not in anchors: failures.append(f"{source}: missing anchor #{fragment} in {target_path}") if failures: print("Internal link check failed:", file=sys.stderr) for failure in failures: print(f"- {failure}", file=sys.stderr) return 1 return 0 if __name__ == "__main__": raise SystemExit(main())