import asyncio
import urllib.robotparser as urobot
from functools import lru_cache
from urllib.parse import urlparse

# User-Agent token presented when evaluating robots.txt rules for this crawler.
USER_AGENT = "HF-FOIA-Search/1.0"

@lru_cache(maxsize=128)
def _get_parser(base_url: str) -> urobot.RobotFileParser:
    parsed = urlparse(base_url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = urobot.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        pass
    return rp

async def allowed(url: str) -> bool:
    """Return True if robots.txt permits USER_AGENT to fetch *url*.

    ``_get_parser`` performs blocking urllib network I/O on a cache
    miss, so it is run on a worker thread via ``asyncio.to_thread``
    instead of directly on the event loop (the original called it
    synchronously, stalling the loop for the duration of the fetch).
    """
    rp = await asyncio.to_thread(_get_parser, url)
    # can_fetch() is pure in-memory rule matching — cheap to run inline.
    return rp.can_fetch(USER_AGENT, url)