GodsDevProject committed on
Commit
fb6a4da
·
verified ·
1 Parent(s): d7db33b

Update ingest/generic_public_foia.py

Browse files
Files changed (1) hide show
  1. ingest/generic_public_foia.py +30 -38
ingest/generic_public_foia.py CHANGED
@@ -1,46 +1,38 @@
1
  # ingest/generic_public_foia.py
2
 
 
 
3
  import requests
4
- from bs4 import BeautifulSoup
5
- from urllib.parse import urlparse
6
- from typing import Dict
7
- from .agency_registry import ALLOWED_FOIA_SOURCES
8
 
9
- MAX_CHARS = 12000
10
 
11
- def infer_agency_from_url(url: str) -> str:
12
- host = urlparse(url).netloc.lower()
13
-
14
- for domain, agency in ALLOWED_FOIA_SOURCES.items():
15
- if domain.startswith("label:"):
16
- continue
17
- if domain in host:
18
- return agency
19
-
20
- return "Unknown"
21
-
22
- def ingest_public_foia_url(url: str) -> Dict:
23
  """
24
- SAFE ingestion:
25
- - user-supplied URL only
26
- - public FOIA / reading room pages
27
- - no crawling, no discovery
28
  """
29
 
30
- agency = infer_agency_from_url(url)
31
-
32
- r = requests.get(url, timeout=15)
33
- r.raise_for_status()
34
-
35
- soup = BeautifulSoup(r.text, "html.parser")
36
-
37
- title = soup.find("h1")
38
- text = soup.get_text(separator=" ", strip=True)
39
-
40
- return {
41
- "agency": agency,
42
- "url": url,
43
- "title": title.text.strip() if title else "Public FOIA Document",
44
- "text": text[:MAX_CHARS],
45
- "source_type": "public_foia"
46
- }
 
 
 
 
 
 
 
 
1
  # ingest/generic_public_foia.py
2
 
3
+ import abc
4
+ import asyncio
5
  import requests
6
+ from typing import List, Dict
 
 
 
7
 
 
8
 
9
class GenericFOIAAdapter(abc.ABC):
    """
    Base adapter for all public FOIA Electronic Reading Rooms.

    Intended to enforce rate-limiting, public-only access, and safe
    defaults. This base class itself implements only the rate limiter
    (``_rate_limit``) and declares the abstract ``search`` contract;
    anything beyond that is the responsibility of concrete subclasses.
    """

    # Identifier reported for this source; presumably overridden by subclasses.
    source_name: str = "UNKNOWN"
    # Root URL of the reading room; presumably overridden by subclasses.
    base_url: str = ""

    def __init__(self, rate_limit_seconds: float = 1.0):
        """
        Args:
            rate_limit_seconds: Minimum spacing, in seconds, between two
                consecutive passes through ``_rate_limit``.
        """
        self.rate_limit_seconds = rate_limit_seconds
        # Event-loop time reserved for the most recent call. Starts at -inf
        # so the very first call never waits, regardless of where the
        # loop's monotonic clock happens to start.
        self._last_call = float("-inf")

    async def _rate_limit(self):
        """
        Wait until at least ``rate_limit_seconds`` separates this call from
        the previous one.

        The time slot is reserved *before* awaiting, so coroutines that call
        this concurrently on the same adapter are spaced out one after the
        other instead of all observing the same stale timestamp and
        proceeding together.
        """
        now = asyncio.get_running_loop().time()
        # No await between reading and writing _last_call: the reservation
        # is atomic with respect to other coroutines on this loop.
        slot = max(now, self._last_call + self.rate_limit_seconds)
        self._last_call = slot
        if slot > now:
            await asyncio.sleep(slot - now)

    @abc.abstractmethod
    async def search(self, query: str) -> List[Dict]:
        """
        Perform a public FOIA search.

        Must return a list of dicts with:
        - source
        - title
        - url
        - snippet
        """
        raise NotImplementedError