github-actions
Sync from GitHub 2025-12-17T12:18:53Z
5a3b322
from __future__ import annotations
import logging
import urllib.robotparser
from dataclasses import dataclass
import structlog
logger = structlog.get_logger(__name__)
@dataclass
class RobotsManager:
robots_url: str
user_agent: str
def __post_init__(self) -> None:
self._parser = urllib.robotparser.RobotFileParser()
def load(self) -> None:
logger.info("robots.load.start", robots_url=self.robots_url)
self._parser.set_url(self.robots_url)
try:
self._parser.read()
logger.info("robots.load.success", can_fetch_all=self._parser.can_fetch(self.user_agent, "*"))
except Exception as exc: # pragma: no cover - network errors are logged
logger.warning("robots.load.failed", error=str(exc))
def is_allowed(self, url: str) -> bool:
try:
return self._parser.can_fetch(self.user_agent, url)
except Exception as exc: # pragma: no cover
logger.warning("robots.check.error", url=url, error=str(exc))
return False