GodsDevProject committed on
Commit
1558b4d
·
verified ·
1 Parent(s): b6a7950

Create ingest/robots.py

Browse files
Files changed (1) hide show
  1. ingest/robots.py +23 -0
ingest/robots.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import urllib.robotparser as urobot
3
+ from functools import lru_cache
4
+ from urllib.parse import urlparse
5
+
6
+ USER_AGENT = "HF-FOIA-Search/1.0"
7
+
8
@lru_cache(maxsize=128)
def _get_parser(base_url: str) -> urobot.RobotFileParser:
    """Return a robots.txt parser for the site that hosts *base_url*.

    The robots.txt location is built from the scheme and network location
    of *base_url*; any path, query or fragment is ignored. Results are
    memoised per distinct *base_url* string (at most 128 entries), so a
    parser returned after a failed fetch is cached as well.

    Fetching is best-effort: if ``read()`` raises, the failure is logged
    and the unread parser is returned rather than propagating the error.
    NOTE: in CPython an unread ``RobotFileParser`` answers ``can_fetch``
    with ``False``, i.e. a failed fetch denies access for that entry.

    Args:
        base_url: A URL on the target site (scheme and host are used).

    Returns:
        A ``RobotFileParser`` pointed at ``<scheme>://<netloc>/robots.txt``.
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level imports do not include ``logging``.
    import logging

    parsed = urlparse(base_url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = urobot.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        # Deliberately broad: network errors, malformed URLs and decode
        # errors must not crash the caller, but they should leave a trace
        # instead of being swallowed silently.
        logging.getLogger(__name__).warning(
            "Could not read robots.txt at %s", robots_url, exc_info=True
        )
    return rp
19
+
20
async def allowed(url: str) -> bool:
    """Report whether robots.txt permits ``USER_AGENT`` to fetch *url*.

    Args:
        url: Absolute URL about to be requested.

    Returns:
        The result of ``RobotFileParser.can_fetch(USER_AGENT, url)`` for
        the parser of the site that hosts *url*.
    """
    # Key the parser cache on the origin only. Passing the full URL made
    # every distinct page a cache miss and re-downloaded robots.txt.
    parts = urlparse(url)
    origin = f"{parts.scheme}://{parts.netloc}"

    # The parser fetches robots.txt with blocking I/O on a cache miss, so
    # run the lookup in the default executor instead of stalling the loop.
    loop = asyncio.get_running_loop()
    rp = await loop.run_in_executor(None, _get_parser, origin)
    return rp.can_fetch(USER_AGENT, url)