# github-actions
# Sync from GitHub 2025-12-17T12:18:53Z
# 5a3b322
from __future__ import annotations
import hashlib
import random
import time
import urllib.parse
from datetime import datetime, timezone
from typing import Iterable
def canonicalize_url(url: str) -> str:
    """Normalize a URL so that equivalent links produce the same string.

    The fragment is dropped, query parameters whose name starts with
    ``utm_`` (case-insensitive) are removed, and a bare root path ``/`` is
    collapsed to an empty path. Trailing slashes on non-root paths are left
    exactly as given. The remaining query parameters are re-encoded with
    ``urllib.parse.urlencode``, so their escaping may differ from the input.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL, or ``url`` itself when normalization yields an
        empty string.
    """
    parsed = urllib.parse.urlparse(url)
    # keep_blank_values=True so parameters such as ``?a=`` survive the round trip.
    pairs = urllib.parse.parse_qsl(parsed.query, keep_blank_values=True)
    kept = [(k, v) for k, v in pairs if not k.lower().startswith("utm_")]
    cleaned_query = urllib.parse.urlencode(kept, doseq=True)
    # Only the bare root path is collapsed; any other path (with or without a
    # trailing slash) is kept untouched to avoid 404s on detail pages.
    path = "" if parsed.path == "/" else parsed.path
    normalized = parsed._replace(query=cleaned_query, fragment="", path=path).geturl()
    return normalized or url
def make_assessment_id(url: str) -> str:
    """Return the hex SHA-1 digest of the canonical form of ``url``.

    The URL is first passed through ``canonicalize_url`` and the result is
    hashed as UTF-8 bytes, so the same canonical URL always yields the same ID.
    """
    hasher = hashlib.sha1()
    hasher.update(canonicalize_url(url).encode("utf-8"))
    return hasher.hexdigest()
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
class RateLimiter:
    """Coarse rate limiter with jitter to respect polite crawling.

    Each call to :meth:`sleep` blocks until at least ``base_delay`` plus a
    random extra of up to ``jitter`` seconds has passed since the previous
    call returned. The very first call never blocks.

    Args:
        base_delay: Minimum number of seconds between consecutive calls.
        jitter: Upper bound, in seconds, of the random delay added per call.
    """

    def __init__(self, base_delay: float, jitter: float) -> None:
        self.base_delay = base_delay
        self.jitter = jitter
        # time.monotonic() has an undefined reference point, so a literal 0.0
        # is not a reliable "long ago". -inf makes the first elapsed time
        # infinite, which guarantees the first call does not wait.
        self._last_ts = float("-inf")

    def sleep(self) -> None:
        """Block just long enough to honour the delay since the last call."""
        now = time.monotonic()
        elapsed = now - self._last_ts
        # A fresh jitter value is drawn on every call.
        delay = self.base_delay + random.uniform(0, self.jitter)
        if elapsed < delay:
            time.sleep(delay - elapsed)
        # Record the time after any wait so the next interval starts from here.
        self._last_ts = time.monotonic()
def batched(iterable: Iterable, size: int) -> Iterable[list]:
    """Yield consecutive batches of at most ``size`` items from ``iterable``.

    Every batch is a new list holding exactly ``size`` items, except the last
    one, which holds whatever remains (and is omitted when nothing remains).

    Args:
        iterable: Source of items; it is consumed lazily, one item at a time.
        size: Number of items per batch; must be at least 1.

    Yields:
        Lists of items in their original order.

    Raises:
        ValueError: If ``size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive size would never match the length check below and the
    # whole input would silently come back as one oversized batch.
    if size < 1:
        raise ValueError("size must be at least 1")
    batch: list = []
    for item in iterable:
        batch.append(item)
        if len(batch) == size:
            yield batch
            # Start a new list so the batch already yielded is not mutated.
            batch = []
    if batch:
        yield batch