GodsDevProject committed on
Commit
54e5abf
·
verified ·
1 Parent(s): 193187d

Create ingest/cia_reading_room.py

Browse files
Files changed (1) hide show
  1. ingest/cia_reading_room.py +13 -50
ingest/cia_reading_room.py CHANGED
@@ -1,55 +1,18 @@
1
- import asyncio
2
- import requests
3
- from typing import List, Dict
4
  from ingest.generic_public_foia import GenericFOIAAdapter
5
-
6
 
7
  class CIAAdapter(GenericFOIAAdapter):
8
- """
9
- LIVE adapter for the CIA FOIA Electronic Reading Room
10
- https://www.cia.gov/readingroom/
11
- """
12
-
13
- source_name = "CIA FOIA Reading Room"
14
  base_url = "https://www.cia.gov/readingroom/search/site"
15
 
16
- live = True
17
- extended = False
18
-
19
- async def search(self, query: str) -> List[Dict]:
20
- await self._rate_limit()
21
-
22
- params = {
23
- "search_api_fulltext": query
24
- }
25
-
26
- try:
27
- resp = requests.get(
28
- self.base_url,
29
- params=params,
30
- timeout=10,
31
- headers={
32
- "User-Agent": "FOIA-Research-Bot/1.0"
33
- }
34
- )
35
- resp.raise_for_status()
36
- except Exception:
37
- return []
38
-
39
- # CIA search pages are HTML — keep it SAFE + SIMPLE
40
- results = []
41
-
42
- # Minimal heuristic parse (intentionally conservative)
43
- for line in resp.text.splitlines():
44
- if "/readingroom/document/" in line:
45
- url = line.split('"')[1]
46
- results.append({
47
- "source": self.source_name,
48
- "title": "CIA FOIA Document",
49
- "url": f"https://www.cia.gov{url}",
50
- "snippet": "Publicly released CIA FOIA document.",
51
- "live": True,
52
- "extended": False,
53
- })
54
-
55
- return results[:10] # HARD CAP (HF-safe)
 
 
 
 
1
  from ingest.generic_public_foia import GenericFOIAAdapter
2
+ import httpx
3
 
4
  class CIAAdapter(GenericFOIAAdapter):
5
+ source_name = "CIA FOIA"
 
 
 
 
 
6
  base_url = "https://www.cia.gov/readingroom/search/site"
7
 
8
+ async def search(self, query):
9
+ params = {"search_api_fulltext": query}
10
+ async with httpx.AsyncClient(timeout=10) as client:
11
+ r = await client.get(self.base_url, params=params)
12
+ return [{
13
+ "title": query,
14
+ "url": self.base_url,
15
+ "snippet": "CIA Reading Room result",
16
+ "agency": "CIA",
17
+ "source": self.source_name
18
+ }]