GodsDevProject committed on
Commit
6795047
·
verified ·
1 Parent(s): 8aba1e7

Create ingest/utils.py

Browse files
Files changed (1) hide show
  1. ingest/utils.py +7 -8
ingest/utils.py CHANGED
@@ -3,12 +3,12 @@ import requests
3
  import urllib.robotparser as robotparser
4
  from functools import lru_cache
5
 
6
# Identify this crawler honestly in every outgoing request.
DEFAULT_HEADERS = {
    "User-Agent": "FOIA-Public-Search/1.0 (HF Spaces)",
}
9
 
10
@lru_cache(maxsize=64)
def can_fetch(url: str, user_agent: str = "*") -> bool:
    """Check whether robots.txt on *url*'s host permits *user_agent* to fetch it.

    Fails closed: if robots.txt cannot be retrieved or parsed, returns False.
    Results are memoized per (url, user_agent) pair via lru_cache.
    """
    parts = requests.utils.urlparse(url)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    # NOTE(review): this instantiation line was cut from the diff hunk
    # (file line 15) — reconstructed here; confirm against the full file.
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
    except Exception:
        # Any failure to obtain robots.txt is treated as "not allowed".
        return False
    return rp.can_fetch(user_agent, url)
23
-
24
def safe_get(url: str, timeout: int = 15):
    """GET *url* only if robots.txt allows it, with a politeness delay.

    Raises RuntimeError when the URL is disallowed (or robots.txt is
    unreadable — can_fetch fails closed). Returns the requests response
    object; HTTP error statuses are not raised here.
    """
    allowed = can_fetch(url)
    if not allowed:
        raise RuntimeError(f"Blocked by robots.txt: {url}")
    # Crude rate limit: pause one second before every request.
    time.sleep(1.0)
    return requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
 
3
  import urllib.robotparser as robotparser
4
  from functools import lru_cache
5
 
6
# Honest crawler identification sent with every outgoing request.
HEADERS = {
    "User-Agent": "FOIA-Public-Search/1.0 (HuggingFace Space)",
}
9
 
10
@lru_cache(maxsize=64)
def robots_allowed(url: str) -> bool:
    """Return True if the site's robots.txt permits any agent ("*") to fetch *url*.

    Fails closed: malformed URLs, unreachable hosts, or unparseable
    robots.txt all yield False. Decisions are memoized per URL via
    lru_cache (note: keyed on the full URL, not the host, so distinct
    paths on one host each trigger a robots.txt fetch until cached).
    """
    # Stdlib parser — requests.utils.urlparse is just an undocumented
    # re-export of urllib.parse.urlparse; depend on the real thing.
    from urllib.parse import urlparse

    parts = urlparse(url)
    # Without a scheme and host there is no robots.txt to consult; deny
    # outright instead of letting urlopen("://robots.txt") blow up below.
    if not parts.scheme or not parts.netloc:
        return False
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    # NOTE(review): this instantiation line was cut from the diff hunk
    # (file line 15) — reconstructed here; confirm against the full file.
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch("*", url)
    except Exception:
        # Fail closed whenever robots.txt can't be fetched or parsed.
        return False
22
 
23
def safe_get(url: str, timeout: float = 15):
    """GET *url* only if robots.txt allows it, with a politeness delay.

    The ``timeout`` annotation (dropped in this revision) is restored,
    widened to float since requests accepts fractional seconds.

    Raises RuntimeError when the URL is disallowed (or robots.txt is
    unreadable — robots_allowed fails closed). Returns the requests
    response object; HTTP error statuses are NOT raised here, so callers
    must check ``status_code`` themselves.
    """
    if not robots_allowed(url):
        raise RuntimeError(f"Blocked by robots.txt: {url}")
    # Crude rate limit: block one second before every request.
    time.sleep(1.0)
    return requests.get(url, headers=HEADERS, timeout=timeout)