File size: 4,000 Bytes
d2f0b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Validate markdown links across the repository.

Checks:
1. Reject non-clickable URL formatting such as `https://...` inside backticks.
2. Reject raw bare URLs that are not markdown links.
3. Optionally verify remote URL reachability with --online.

Usage:
    python scripts/check_links.py
    python scripts/check_links.py --online
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


# Markdown inline link "[text](https://...)"; group 1 captures the URL target.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\((https?://[^)\s]+)\)")
# URL wrapped in backticks (rendered as inline code, not clickable); group 1 is the URL.
CODE_URL_RE = re.compile(r"`(https?://[^`\s]+)`")
# Any http(s) URL; the match stops before whitespace, ')', '>' or ']'.
RAW_URL_RE = re.compile(r"https?://[^\s)>\]]+")


def md_files(root: Path) -> list[Path]:
    """Collect every markdown file under *root*, skipping anything inside .git."""
    found = [candidate for candidate in root.rglob("*.md") if ".git" not in candidate.parts]
    found.sort()
    return found


def lint_markdown_links(path: Path) -> tuple[list[str], set[str]]:
    """Lint one markdown file.

    Returns (errors, urls): formatting problems found in *path*, and the set
    of URL targets that appear in proper markdown links.
    """
    problems: list[str] = []
    link_targets: set[str] = set()
    text = path.read_text(encoding="utf-8")

    for number, content in enumerate(text.splitlines(), start=1):
        # Proper markdown links are collected, not flagged.
        link_targets.update(m.group(1) for m in MARKDOWN_LINK_RE.finditer(content))

        # Backtick-wrapped URLs render as code and are not clickable.
        problems.extend(
            f"{path}:{number} non-clickable code URL; use markdown link: {m.group(1)}"
            for m in CODE_URL_RE.finditer(content)
        )

        for hit in RAW_URL_RE.finditer(content):
            before = content[hit.start() - 1] if hit.start() >= 1 else ""
            after = content[hit.end() : hit.end() + 1]

            # Part of a markdown link target: "[text](url)".
            if before == "(" or after == ")":
                continue
            # Inside backticks: CODE_URL_RE already reported it above.
            if before == "`" or after == "`":
                continue

            problems.append(
                f"{path}:{number} bare URL; wrap in markdown link: {hit.group(0)}"
            )

    return problems, link_targets


def check_url_online(url: str, timeout: float = 10.0) -> str | None:
    """Return None if *url* is reachable, otherwise a human-readable error.

    Tries a HEAD request first; some hosts reject HEAD (HTTP 403/405), in
    which case a single GET is attempted instead. Any other HTTP error,
    connection failure, or timeout is reported immediately.

    Args:
        url: Absolute http(s) URL to probe.
        timeout: Socket timeout in seconds for each request.
    """
    headers = {"User-Agent": "pashto-link-checker/1.0"}
    for method in ("HEAD", "GET"):
        request = Request(url, method=method, headers=headers)
        try:
            with urlopen(request, timeout=timeout):
                return None
        except HTTPError as exc:
            if method == "HEAD" and exc.code in {403, 405}:
                # Host blocks HEAD; fall through and retry with GET.
                continue
            return f"{url} returned HTTP {exc.code}"
        except URLError as exc:
            return f"{url} failed: {exc.reason}"
        except TimeoutError:
            return f"{url} failed: timeout"
    # Unreachable: the GET iteration always returns. Kept for type checkers.
    return None


def main() -> int:
    """Lint all markdown files under --root; return a shell exit status."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=".", help="Repository root")
    parser.add_argument("--online", action="store_true", help="Check URL reachability online")
    options = parser.parse_args()

    repo_root = Path(options.root).resolve()
    markdown_paths = md_files(repo_root)
    problems: list[str] = []
    seen_urls: set[str] = set()

    for markdown_path in markdown_paths:
        file_errors, file_urls = lint_markdown_links(markdown_path)
        problems += file_errors
        seen_urls |= file_urls

    if options.online:
        # Deterministic order so CI output is stable run-to-run.
        outcomes = (check_url_online(link) for link in sorted(seen_urls))
        problems.extend(f"URL check failed: {failure}" for failure in outcomes if failure)

    if not problems:
        print(f"Link check passed: {len(markdown_paths)} markdown files, {len(seen_urls)} URLs")
        return 0

    print("Link check failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())