GodsDevProject committed on
Commit
2bf4b36
·
verified ·
1 Parent(s): ac6e700

Create ingest/generic_public_foia.py

Browse files
Files changed (1) hide show
  1. ingest/generic_public_foia.py +31 -36
ingest/generic_public_foia.py CHANGED
@@ -1,37 +1,32 @@
 
 
1
  import time
2
- import aiohttp
3
- from ingest.generic_public_foia import GenericFOIAAdapter
4
-
5
-
6
class CIAAdapter(GenericFOIAAdapter):
    """Adapter that queries the CIA FOIA Reading Room site search.

    The adapter is inert unless constructed with ``live=True``; in that
    case ``search`` performs one HTTP GET against ``base_url`` and returns
    a single placeholder record (the response body is read but not parsed).
    """

    source_name = "CIA FOIA Reading Room"
    base_url = "https://www.cia.gov/readingroom/search/site"
    is_stub = False

    def __init__(self, live: bool = False):
        """Initialise the base adapter with ``rate_limit=2.0`` and store *live*."""
        super().__init__(rate_limit=2.0)
        self.live = live

    async def search(self, query):
        """Return placeholder results for *query*, or ``[]`` when not live.

        Side effect: stores the elapsed wall-clock time of the request in
        ``self.last_latency``.
        """
        # Offline mode: never touch the network.
        if not self.live:
            return []

        await self._throttle()
        started_at = time.time()

        request_params = {"search_api_fulltext": query}
        async with aiohttp.ClientSession() as http, http.get(
            self.base_url,
            params=request_params,
            timeout=20,
        ) as response:
            # Body is consumed but intentionally not parsed.
            await response.text()

        self.last_latency = time.time() - started_at

        # Public-safe placeholder parse
        placeholder = {
            "title": f"CIA FOIA document mentioning '{query}'",
            "url": self.base_url,
            "snippet": "Publicly released FOIA document.",
        }
        return [placeholder]
 
1
+ import abc
2
+ import asyncio
3
  import time
4
+ import urllib.robotparser as robotparser
5
+
6
class GenericFOIAAdapter(abc.ABC):
    """Abstract base class for adapters that search a public FOIA source.

    Subclasses are expected to override ``source_name`` and ``base_url``
    and must implement :meth:`search`.  The base class supplies a simple
    per-instance rate limiter (:meth:`_rate_limit`) and a robots.txt
    permission check (:meth:`allowed`).
    """

    source_name = "UNKNOWN"
    base_url = ""
    is_live = False

    # Minimum spacing, in seconds, that ``_rate_limit`` enforces between
    # calls.  Subclasses may override it.
    min_interval = 1.0

    def __init__(self):
        """Set up rate-limit state and try to load the site's robots.txt.

        The robots.txt fetch is a blocking network call made on a
        best-effort basis: a failure is logged and otherwise ignored.
        No fetch is attempted when ``base_url`` is empty.
        """
        import logging

        self.last_call = 0
        self.health = "unknown"
        self._rp = robotparser.RobotFileParser()
        self._rp.set_url(self._robots_url())
        if not self.base_url:
            # Nothing to fetch; the parser stays in its "never read" state.
            return
        try:
            self._rp.read()
        except Exception as exc:
            # Deliberately best-effort: construction must not fail because
            # robots.txt is unreachable, but the failure should be visible.
            logging.getLogger(__name__).warning(
                "Could not read robots.txt for %s: %s", self.source_name, exc
            )

    def _robots_url(self):
        """Return the robots.txt URL at the root of ``base_url``'s host.

        robots.txt lives at the host root, so a path component in
        ``base_url`` must be discarded rather than appended to.
        """
        from urllib.parse import urljoin

        return urljoin(self.base_url, "/robots.txt")

    async def _rate_limit(self):
        """Wait until at least ``min_interval`` seconds separate two calls.

        Sleeps only for the remaining part of the interval, then records
        the current time in ``self.last_call``.
        """
        remaining = self.min_interval - (time.time() - self.last_call)
        if remaining > 0:
            # Clamp so a backwards wall-clock step cannot cause a long stall.
            await asyncio.sleep(min(remaining, self.min_interval))
        self.last_call = time.time()

    def allowed(self, path="/"):
        """Return whether robots.txt permits any user agent to fetch *path*.

        *path* is appended to ``base_url`` before the check.
        NOTE(review): if robots.txt was never read successfully this
        appears to return ``False`` (CPython behaviour) - confirm that
        callers expect that.
        """
        return self._rp.can_fetch("*", self.base_url + path)

    @abc.abstractmethod
    async def search(self, query: str):
        """Search the source for *query*; must be implemented by subclasses."""