GodsDevProject committed on
Commit
145898a
·
verified ·
1 Parent(s): b299465

Create ingest/cia_reading_room.py

Browse files
Files changed (1) hide show
  1. ingest/cia_reading_room.py +51 -2
ingest/cia_reading_room.py CHANGED
@@ -1,6 +1,55 @@
 
 
 
1
  from ingest.generic_public_foia import GenericFOIAAdapter
2
 
3
 
4
  class CIAAdapter(GenericFOIAAdapter):
5
- source_name = "CIA Reading Room"
6
- base_url = "https://www.cia.gov/readingroom/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import requests
3
+ from typing import List, Dict
4
  from ingest.generic_public_foia import GenericFOIAAdapter
5
 
6
 
7
  class CIAAdapter(GenericFOIAAdapter):
8
+ """
9
+ LIVE adapter for the CIA FOIA Electronic Reading Room
10
+ https://www.cia.gov/readingroom/
11
+ """
12
+
13
+ source_name = "CIA FOIA Reading Room"
14
+ base_url = "https://www.cia.gov/readingroom/search/site"
15
+
16
+ live = True
17
+ extended = False
18
+
19
+ async def search(self, query: str) -> List[Dict]:
20
+ await self._rate_limit()
21
+
22
+ params = {
23
+ "search_api_fulltext": query
24
+ }
25
+
26
+ try:
27
+ resp = requests.get(
28
+ self.base_url,
29
+ params=params,
30
+ timeout=10,
31
+ headers={
32
+ "User-Agent": "FOIA-Research-Bot/1.0"
33
+ }
34
+ )
35
+ resp.raise_for_status()
36
+ except Exception:
37
+ return []
38
+
39
+ # CIA search pages are HTML — keep it SAFE + SIMPLE
40
+ results = []
41
+
42
+ # Minimal heuristic parse (intentionally conservative)
43
+ for line in resp.text.splitlines():
44
+ if "/readingroom/document/" in line:
45
+ url = line.split('"')[1]
46
+ results.append({
47
+ "source": self.source_name,
48
+ "title": "CIA FOIA Document",
49
+ "url": f"https://www.cia.gov{url}",
50
+ "snippet": "Publicly released CIA FOIA document.",
51
+ "live": True,
52
+ "extended": False,
53
+ })
54
+
55
+ return results[:10] # HARD CAP (HF-safe)