Spaces:
Sleeping
Sleeping
Create ingest/utils.py
Browse files- ingest/utils.py +7 -8
ingest/utils.py
CHANGED
|
@@ -3,12 +3,12 @@ import requests
|
|
| 3 |
import urllib.robotparser as robotparser
|
| 4 |
from functools import lru_cache
|
| 5 |
|
| 6 |
-
|
| 7 |
-
"User-Agent": "FOIA-Public-Search/1.0 (
|
| 8 |
}
|
| 9 |
|
| 10 |
@lru_cache(maxsize=64)
|
| 11 |
-
def
|
| 12 |
parsed = requests.utils.urlparse(url)
|
| 13 |
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
|
| 14 |
|
|
@@ -16,13 +16,12 @@ def can_fetch(url: str, user_agent: str = "*") -> bool:
|
|
| 16 |
try:
|
| 17 |
rp.set_url(robots_url)
|
| 18 |
rp.read()
|
|
|
|
| 19 |
except Exception:
|
| 20 |
return False
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def safe_get(url: str, timeout: int = 15):
|
| 25 |
-
if not can_fetch(url):
|
| 26 |
raise RuntimeError(f"Blocked by robots.txt: {url}")
|
| 27 |
time.sleep(1.0)
|
| 28 |
-
return requests.get(url, headers=
|
|
|
|
| 3 |
import urllib.robotparser as robotparser
|
| 4 |
from functools import lru_cache
|
| 5 |
|
| 6 |
+
HEADERS = {
|
| 7 |
+
"User-Agent": "FOIA-Public-Search/1.0 (HuggingFace Space)"
|
| 8 |
}
|
| 9 |
|
| 10 |
@lru_cache(maxsize=64)
|
| 11 |
+
def robots_allowed(url: str) -> bool:
|
| 12 |
parsed = requests.utils.urlparse(url)
|
| 13 |
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
|
| 14 |
|
|
|
|
| 16 |
try:
|
| 17 |
rp.set_url(robots_url)
|
| 18 |
rp.read()
|
| 19 |
+
return rp.can_fetch("*", url)
|
| 20 |
except Exception:
|
| 21 |
return False
|
| 22 |
|
| 23 |
+
def safe_get(url: str, timeout=15):
|
| 24 |
+
if not robots_allowed(url):
|
|
|
|
|
|
|
| 25 |
raise RuntimeError(f"Blocked by robots.txt: {url}")
|
| 26 |
time.sleep(1.0)
|
| 27 |
+
return requests.get(url, headers=HEADERS, timeout=timeout)
|