"""Validate markdown links across the repository.

Checks:
1. Reject non-clickable URL formatting such as `https://...` inside backticks.
2. Reject raw bare URLs that are not markdown links.
3. Optionally verify remote URL reachability with --online.

Usage:
    python scripts/check_links.py
    python scripts/check_links.py --online
    python scripts/check_links.py --root path/to/repository
"""
| |
|
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import re |
| | from pathlib import Path |
| | from urllib.error import HTTPError, URLError |
| | from urllib.request import Request, urlopen |
| |
|
| |
|
# Inline markdown link of the form [text](http...); group 1 captures the URL.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\((https?://[^)\s]+)\)")
# A URL that is the entire content of an inline code span; group 1 is the URL.
CODE_URL_RE = re.compile(r"`(https?://[^`\s]+)`")
# Any http(s) URL. The match stops at whitespace, ")", ">", "]" and, like
# CODE_URL_RE, at a backtick. Without the backtick exclusion a URL that ends
# an inline code span (for example `curl https://example.com`) would be
# captured together with the closing backtick, so code inspecting the
# character after the match could never see that backtick.
RAW_URL_RE = re.compile(r"https?://[^\s)>\]`]+")
| |
|
| |
|
def md_files(root: Path) -> list[Path]:
    """Return the markdown files found under *root*, sorted by path.

    Args:
        root: Directory that is searched recursively for ``*.md`` entries.

    Returns:
        Sorted list of regular files matching ``*.md`` that do not live
        inside a ``.git`` directory below *root*.
    """
    return sorted(
        path
        for path in root.rglob("*.md")
        # rglob also yields directories whose name matches the pattern; only
        # regular files can be read by the caller.
        if path.is_file()
        # Look only at the components below root, so an ancestor directory
        # that happens to be called ".git" does not exclude every file.
        and ".git" not in path.relative_to(root).parts
    )
| |
|
| |
|
def lint_markdown_links(path: Path) -> tuple[list[str], set[str]]:
    """Lint one markdown file for URL formatting problems.

    The file is read as UTF-8 and examined line by line.

    Args:
        path: Markdown file to inspect.

    Returns:
        A pair ``(errors, urls)``. ``errors`` holds one message per finding,
        each prefixed with ``path:line_number``. ``urls`` is the set of URLs
        captured by ``MARKDOWN_LINK_RE``.
    """
    problems: list[str] = []
    linked: set[str] = set()
    text = path.read_text(encoding="utf-8")

    for number, content in enumerate(text.splitlines(), start=1):
        # URLs that appear as proper markdown links are collected for the caller.
        linked.update(found.group(1) for found in MARKDOWN_LINK_RE.finditer(content))

        # A URL wrapped in backticks is reported as non-clickable.
        problems.extend(
            f"{path}:{number} non-clickable code URL; use markdown link: {found.group(1)}"
            for found in CODE_URL_RE.finditer(content)
        )

        for found in RAW_URL_RE.finditer(content):
            # Characters directly around the match; "" when the match touches
            # the start or the end of the line.
            before = content[found.start() - 1] if found.start() >= 1 else ""
            after = content[found.end() : found.end() + 1]

            # Adjacent parentheses: treated as part of a markdown link target.
            if before == "(" or after == ")":
                continue
            # Adjacent backticks: code formatting is handled by the check above.
            if before == "`" or after == "`":
                continue

            problems.append(
                f"{path}:{number} bare URL; wrap in markdown link: {found.group(0)}"
            )

    return problems, linked
| |
|
| |
|
def check_url_online(url: str, timeout: float = 10.0) -> str | None:
    """Request *url* and describe the failure, if there is one.

    A ``HEAD`` request is sent first. When it is answered with HTTP 403 or
    405 the request is repeated with ``GET`` and that second outcome decides
    the result.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``urlopen`` for each attempt.

    Returns:
        ``None`` when a request succeeds, otherwise a one-line message that
        starts with *url*. No exception from the request itself is allowed
        to escape, so one broken link cannot abort a whole run.
    """
    # Local import: keeps this function independent of the module's import list.
    from http.client import HTTPException

    headers = {"User-Agent": "pashto-link-checker/1.0"}
    for method in ("HEAD", "GET"):
        try:
            request = Request(url, method=method, headers=headers)
            with urlopen(request, timeout=timeout):
                return None
        except HTTPError as exc:
            # 403/405 on HEAD may only mean that HEAD itself is refused;
            # retry with GET before reporting anything.
            if method == "HEAD" and exc.code in {403, 405}:
                continue
            return f"{url} returned HTTP {exc.code}"
        except URLError as exc:
            return f"{url} failed: {exc.reason}"
        except TimeoutError:
            return f"{url} failed: timeout"
        except (OSError, HTTPException, ValueError) as exc:
            # urlopen does not wrap everything in URLError: a malformed
            # host/port (InvalidURL), a dropped connection or a garbled
            # status line surface as these types.
            return f"{url} failed: {exc}"

    # Not reached: the GET attempt returns on every path above.
    return None
| |
|
| |
|
def main() -> int:
    """Run the link check from the command line.

    Options:
        --root: directory to scan (defaults to the current directory).
        --online: additionally request every collected URL.

    Returns:
        ``1`` when at least one problem was found, ``0`` otherwise. A summary
        is printed in both cases.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--root", default=".", help="Repository root")
    cli.add_argument("--online", action="store_true", help="Check URL reachability online")
    options = cli.parse_args()

    markdown_paths = md_files(Path(options.root).resolve())
    problems: list[str] = []
    seen_urls: set[str] = set()

    # Offline pass: gather formatting problems and the linked URLs per file.
    for markdown_path in markdown_paths:
        file_problems, file_urls = lint_markdown_links(markdown_path)
        problems += file_problems
        seen_urls |= file_urls

    # Optional online pass over the de-duplicated URLs, in sorted order.
    if options.online:
        outcomes = (check_url_online(url) for url in sorted(seen_urls))
        problems.extend(f"URL check failed: {outcome}" for outcome in outcomes if outcome)

    if not problems:
        print(f"Link check passed: {len(markdown_paths)} markdown files, {len(seen_urls)} URLs")
        return 0

    print("Link check failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1
| |
|
| |
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
| |
|