# Commit d2f0b77 (musaw):
# docs: make all links clickable and add structured resource/docs tooling
"""Validate markdown links across the repository.
Checks:
1. Reject non-clickable URL formatting such as `https://...` inside backticks.
2. Reject raw bare URLs that are not markdown links.
3. Optionally verify remote URL reachability with --online.
Usage:
python scripts/check_links.py
python scripts/check_links.py --online
"""
from __future__ import annotations
import argparse
import re
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Markdown inline link, capturing the URL: [text](https://example.com).
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\((https?://[^)\s]+)\)")
# URL wrapped in backticks, e.g. `https://example.com` — renders non-clickable.
CODE_URL_RE = re.compile(r"`(https?://[^`\s]+)`")
# Any http(s) URL; stops at whitespace and the trailing delimiters ) > ].
RAW_URL_RE = re.compile(r"https?://[^\s)>\]]+")
def md_files(root: Path) -> list[Path]:
    """Return every markdown file under *root*, sorted, skipping .git trees."""
    candidates = [p for p in root.rglob("*.md") if ".git" not in p.parts]
    return sorted(candidates)
def lint_markdown_links(path: Path) -> tuple[list[str], set[str]]:
    """Lint one markdown file for link formatting.

    Returns ``(errors, urls)``: human-readable error messages for
    non-clickable or bare URLs, and the set of URLs that appear inside
    proper markdown links.
    """
    problems: list[str] = []
    linked_urls: set[str] = set()
    text = path.read_text(encoding="utf-8")
    for line_no, line in enumerate(text.splitlines(), start=1):
        linked_urls.update(m.group(1) for m in MARKDOWN_LINK_RE.finditer(line))
        problems.extend(
            f"{path}:{line_no} non-clickable code URL; use markdown link: {m.group(1)}"
            for m in CODE_URL_RE.finditer(line)
        )
        for raw in RAW_URL_RE.finditer(line):
            before = line[raw.start() - 1] if raw.start() >= 1 else ""
            after = line[raw.end()] if raw.end() < len(line) else ""
            # Part of a markdown link: preceded by "(" or followed by ")".
            if before == "(" or after == ")":
                continue
            # Inside backticks: already reported by the code-URL check above.
            if before == "`" or after == "`":
                continue
            problems.append(
                f"{path}:{line_no} bare URL; wrap in markdown link: {raw.group(0)}"
            )
    return problems, linked_urls
def check_url_online(url: str, timeout: float = 10.0) -> str | None:
    """Verify that *url* is reachable over the network.

    Issues a HEAD request first; if the host rejects HEAD with 403 or 405
    (some hosts block HEAD), retries once with GET.

    Args:
        url: Absolute http(s) URL to probe.
        timeout: Per-request timeout in seconds.

    Returns:
        None when the URL is reachable, otherwise a short error message.
    """

    def _attempt(method: str) -> tuple[int | None, str | None]:
        # One request attempt; returns (http_status_or_None, error_or_None).
        request = Request(
            url, method=method, headers={"User-Agent": "pashto-link-checker/1.0"}
        )
        try:
            with urlopen(request, timeout=timeout):
                return None, None
        except HTTPError as exc:
            return exc.code, f"{url} returned HTTP {exc.code}"
        except URLError as exc:
            return None, f"{url} failed: {exc.reason}"
        except TimeoutError:
            return None, f"{url} failed: timeout"

    code, error = _attempt("HEAD")
    if error is None:
        return None
    if code in {403, 405}:
        # Some hosts block HEAD; retry with GET.
        _, error = _attempt("GET")
    return error
def main() -> int:
    """CLI entry point; returns the process exit status (0 ok, 1 failures)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=".", help="Repository root")
    parser.add_argument(
        "--online", action="store_true", help="Check URL reachability online"
    )
    options = parser.parse_args()

    repo_root = Path(options.root).resolve()
    markdown_paths = md_files(repo_root)

    failures: list[str] = []
    seen_urls: set[str] = set()
    for md_path in markdown_paths:
        file_errors, file_urls = lint_markdown_links(md_path)
        failures.extend(file_errors)
        seen_urls |= file_urls

    if options.online:
        # Deduplicated URLs are probed once each, in sorted order.
        for url in sorted(seen_urls):
            problem = check_url_online(url)
            if problem:
                failures.append(f"URL check failed: {problem}")

    if not failures:
        print(f"Link check passed: {len(markdown_paths)} markdown files, {len(seen_urls)} URLs")
        return 0

    print("Link check failed:")
    for failure in failures:
        print(f"- {failure}")
    return 1
# Script entry point: exit with main()'s status code.
if __name__ == "__main__":
    raise SystemExit(main())