"""Async robots.txt gate for a polite crawler.

Caches one ``RobotFileParser`` per site *origin* and answers
"may USER_AGENT fetch this URL?" without blocking the event loop.
"""
import asyncio
import urllib.robotparser as urobot
from functools import lru_cache
from urllib.parse import urlparse

# Identifies this crawler when matching robots.txt User-agent rules.
USER_AGENT = "HF-FOIA-Search/1.0"


@lru_cache(maxsize=128)
def _parser_for_origin(origin: str) -> urobot.RobotFileParser:
    """Fetch, parse, and cache robots.txt for one ``scheme://host`` origin.

    Keying the cache on the origin (not the full page URL) means
    robots.txt is fetched at most once per site rather than once per
    page.  If the fetch fails, the parser is returned unread; an unread
    ``RobotFileParser`` answers ``False`` for every URL (fail-closed).
    """
    rp = urobot.RobotFileParser()
    rp.set_url(f"{origin}/robots.txt")
    try:
        rp.read()  # blocking network fetch; callers offload to a thread
    except (OSError, ValueError):
        # Best-effort: URLError (an OSError subclass) covers DNS/connect
        # failures; ValueError covers undecodable bodies.  The parser is
        # left unread, so can_fetch() will deny everything for this site.
        pass
    return rp


def _get_parser(base_url: str) -> urobot.RobotFileParser:
    """Return the (cached) robots.txt parser for *base_url*'s origin."""
    parsed = urlparse(base_url)
    return _parser_for_origin(f"{parsed.scheme}://{parsed.netloc}")


async def allowed(url: str) -> bool:
    """Return True if robots.txt permits USER_AGENT to fetch *url*.

    The first call for a given origin performs blocking network I/O
    (fetching robots.txt), so parser acquisition runs in the default
    thread-pool executor instead of on the event loop.
    """
    loop = asyncio.get_running_loop()
    rp = await loop.run_in_executor(None, _get_parser, url)
    # can_fetch is pure in-memory computation once robots.txt is loaded.
    return rp.can_fetch(USER_AGENT, url)