File size: 4,000 Bytes
d2f0b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Validate markdown links across the repository.

Checks:
1. Reject non-clickable URL formatting such as `https://...` inside backticks.
2. Reject raw bare URLs that are not markdown links.
3. Optionally verify remote URL reachability with --online.

Usage:
    python scripts/check_links.py
    python scripts/check_links.py --online
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


# Markdown inline link "[text](https://...)"; group 1 captures the URL target.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\((https?://[^)\s]+)\)")
# URL wrapped in backticks (rendered as inline code, not clickable); group 1 is the URL.
CODE_URL_RE = re.compile(r"`(https?://[^`\s]+)`")
# Any http(s) URL; the match stops before whitespace, ')', '>' or ']'.
RAW_URL_RE = re.compile(r"https?://[^\s)>\]]+")


def md_files(root: Path) -> list[Path]:
    """Collect every markdown file under *root*, skipping anything inside .git."""
    found = [candidate for candidate in root.rglob("*.md") if ".git" not in candidate.parts]
    found.sort()
    return found


def lint_markdown_links(path: Path) -> tuple[list[str], set[str]]:
    """Lint one markdown file.

    Returns (errors, urls): formatting problems found in *path*, and the set
    of URL targets that appear in proper markdown links.
    """
    problems: list[str] = []
    link_targets: set[str] = set()
    text = path.read_text(encoding="utf-8")

    for number, content in enumerate(text.splitlines(), start=1):
        # Proper markdown links are collected, not flagged.
        link_targets.update(m.group(1) for m in MARKDOWN_LINK_RE.finditer(content))

        # Backtick-wrapped URLs render as code and are not clickable.
        problems.extend(
            f"{path}:{number} non-clickable code URL; use markdown link: {m.group(1)}"
            for m in CODE_URL_RE.finditer(content)
        )

        for hit in RAW_URL_RE.finditer(content):
            before = content[hit.start() - 1] if hit.start() >= 1 else ""
            after = content[hit.end() : hit.end() + 1]

            # Part of a markdown link target: "[text](url)".
            if before == "(" or after == ")":
                continue
            # Inside backticks: CODE_URL_RE already reported it above.
            if before == "`" or after == "`":
                continue

            problems.append(
                f"{path}:{number} bare URL; wrap in markdown link: {hit.group(0)}"
            )

    return problems, link_targets


def check_url_online(url: str, timeout: float = 10.0) -> str | None:
    """Return None if *url* is reachable, otherwise a human-readable error.

    Tries a HEAD request first; some hosts reject HEAD (HTTP 403/405), in
    which case a single GET is attempted instead. Any other HTTP error,
    connection failure, or timeout is reported immediately.

    Args:
        url: Absolute http(s) URL to probe.
        timeout: Socket timeout in seconds for each request.
    """
    headers = {"User-Agent": "pashto-link-checker/1.0"}
    for method in ("HEAD", "GET"):
        request = Request(url, method=method, headers=headers)
        try:
            with urlopen(request, timeout=timeout):
                return None
        except HTTPError as exc:
            if method == "HEAD" and exc.code in {403, 405}:
                # Host blocks HEAD; fall through and retry with GET.
                continue
            return f"{url} returned HTTP {exc.code}"
        except URLError as exc:
            return f"{url} failed: {exc.reason}"
        except TimeoutError:
            return f"{url} failed: timeout"
    # Unreachable: the GET iteration always returns. Kept for type checkers.
    return None


def main() -> int:
    """Lint all markdown files under --root; return a shell exit status."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=".", help="Repository root")
    parser.add_argument("--online", action="store_true", help="Check URL reachability online")
    options = parser.parse_args()

    repo_root = Path(options.root).resolve()
    markdown_paths = md_files(repo_root)
    problems: list[str] = []
    seen_urls: set[str] = set()

    for markdown_path in markdown_paths:
        file_errors, file_urls = lint_markdown_links(markdown_path)
        problems += file_errors
        seen_urls |= file_urls

    if options.online:
        # Deterministic order so CI output is stable run-to-run.
        outcomes = (check_url_online(link) for link in sorted(seen_urls))
        problems.extend(f"URL check failed: {failure}" for failure in outcomes if failure)

    if not problems:
        print(f"Link check passed: {len(markdown_paths)} markdown files, {len(seen_urls)} URLs")
        return 0

    print("Link check failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())