Spaces:
Sleeping
Sleeping
File size: 615 Bytes
1558b4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import asyncio
import urllib.robotparser as urobot
from functools import lru_cache
from urllib.parse import urlparse
# User-agent token passed to RobotFileParser.can_fetch() by allowed().
USER_AGENT = "HF-FOIA-Search/1.0"
@lru_cache(maxsize=128)
def _load_robots(robots_url: str) -> urobot.RobotFileParser:
    """Fetch and parse one robots.txt file, memoized per robots.txt URL.

    Args:
        robots_url: Absolute URL of the robots.txt file to load.

    Returns:
        A RobotFileParser for ``robots_url``. If the download or parse
        fails, the parser is returned without any rules loaded.
    """
    rp = urobot.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception as exc:
        # Deliberately best-effort: a broken or unreachable robots.txt must
        # not crash the caller. A parser that never loaded rules answers
        # False from can_fetch(), so such hosts are treated as off-limits.
        # NOTE: the failed parser stays cached until cache_clear() is called.
        import logging

        logging.getLogger(__name__).warning(
            "Could not load %s: %s", robots_url, exc
        )
    return rp


def _get_parser(base_url: str) -> urobot.RobotFileParser:
    """Return the cached robots.txt parser for the site that serves a URL.

    The cache is keyed on the site's robots.txt URL (scheme + netloc), not
    on the full page URL, so all pages of one site share a single fetch.

    Args:
        base_url: Any URL on the target site; path and query are ignored.

    Returns:
        The RobotFileParser for ``<scheme>://<netloc>/robots.txt``.
    """
    parsed = urlparse(base_url)
    return _load_robots(f"{parsed.scheme}://{parsed.netloc}/robots.txt")


# Keep the lru_cache management hooks reachable under the original name.
_get_parser.cache_clear = _load_robots.cache_clear
_get_parser.cache_info = _load_robots.cache_info
async def allowed(url: str) -> bool:
    """Report whether robots.txt permits USER_AGENT to fetch ``url``.

    Args:
        url: Absolute URL that is about to be requested.

    Returns:
        True if the site's robots.txt rules allow USER_AGENT to fetch
        ``url``, otherwise False.
    """
    # _get_parser may perform a blocking HTTP download of robots.txt, so run
    # it in the default executor instead of stalling the event loop.
    loop = asyncio.get_running_loop()
    rp = await loop.run_in_executor(None, _get_parser, url)
    return rp.can_fetch(USER_AGENT, url)