Create ingest/robots.py
ingest/robots.py    ADDED    +23 -0
@@ -0,0 +1,23 @@
+import asyncio
+import urllib.robotparser as urobot
+from functools import lru_cache
+from urllib.parse import urlparse
+
+USER_AGENT = "HF-FOIA-Search/1.0"
+
+@lru_cache(maxsize=128)
+def _get_parser(robots_url: str) -> urobot.RobotFileParser:
+    rp = urobot.RobotFileParser()
+    rp.set_url(robots_url)
+    try:
+        rp.read()  # blocking network fetch; lru_cache keeps one parser per host
+    except Exception:
+        pass  # unreadable robots.txt is treated as permissive
+    return rp
+
+async def allowed(url: str) -> bool:
+    # robots parser is sync and does blocking I/O; run it in a thread
+    parsed = urlparse(url)
+    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+    rp = await asyncio.to_thread(_get_parser, robots_url)
+    return rp.can_fetch(USER_AGENT, url)
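As a usage sketch, not part of this commit: an ingest fetcher would gate each download on allowed() before making the request. The fetch_if_allowed helper, the aiohttp session, and the example URL below are assumptions about the surrounding pipeline; only USER_AGENT and allowed() come from ingest/robots.py.

# Hypothetical usage sketch; fetch_if_allowed and the aiohttp calls are
# assumptions, not part of the committed module.
import asyncio
import aiohttp
from ingest.robots import USER_AGENT, allowed

async def fetch_if_allowed(session: aiohttp.ClientSession, url: str) -> str | None:
    if not await allowed(url):
        return None  # robots.txt disallows this URL for our user agent
    async with session.get(url, headers={"User-Agent": USER_AGENT}) as resp:
        return await resp.text()

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        body = await fetch_if_allowed(session, "https://www.example.com/docs/page.html")
        print("fetched" if body is not None else "blocked by robots.txt")

asyncio.run(main())

Sending the same user agent string that allowed() checks against keeps the robots.txt decision and the actual request consistent.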