Spaces:
Sleeping
Sleeping
| import asyncio | |
| import urllib.robotparser as urobot | |
| from functools import lru_cache | |
| from urllib.parse import urlparse | |
| USER_AGENT = "HF-FOIA-Search/1.0" | |
@lru_cache(maxsize=64)
def _read_robots(robots_url: str) -> urobot.RobotFileParser:
    """Fetch and parse the robots.txt at *robots_url*, cached per host.

    On any fetch error (DNS failure, timeout, etc.) the parser is
    returned with no rules loaded; ``RobotFileParser.can_fetch`` then
    answers False for every URL (fail-closed), which matches the
    original swallow-the-error behavior.
    """
    rp = urobot.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        # Best-effort: leave the parser empty on network errors.
        pass
    return rp


def _get_parser(base_url: str) -> urobot.RobotFileParser:
    """Return the (cached) robots.txt parser for the host of *base_url*.

    Caching is keyed on the derived ``scheme://netloc/robots.txt`` URL —
    not on *base_url* itself — so all pages of one host share a single
    robots.txt fetch. (``lru_cache`` was imported for this but never
    applied; previously every call re-fetched robots.txt.)
    """
    parsed = urlparse(base_url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    return _read_robots(robots_url)
async def allowed(url: str) -> bool:
    """Return True if robots.txt permits USER_AGENT to fetch *url*.

    ``_get_parser`` performs blocking network I/O (``rp.read()``), so it
    is dispatched to the default thread-pool executor instead of being
    called directly from the coroutine — calling it inline would stall
    the whole event loop for the duration of the robots.txt fetch.
    """
    loop = asyncio.get_running_loop()
    rp = await loop.run_in_executor(None, _get_parser, url)
    return rp.can_fetch(USER_AGENT, url)