GodsDevProject committed on
Commit
f680dc4
·
verified ·
1 Parent(s): 507d753

Create ingest/adapters/cia.py

Browse files
Files changed (1) hide show
  1. ingest/adapters/cia.py +28 -26
ingest/adapters/cia.py CHANGED
@@ -1,34 +1,36 @@
1
  import aiohttp
 
2
  from ingest.generic_public_foia import GenericFOIAAdapter
3
 
4
  class CIAAdapter(GenericFOIAAdapter):
5
- source_name = "CIA FOIA Reading Room"
6
- base_url = "https://www.cia.gov/readingroom"
7
- is_live = True
8
-
9
- async def search(self, query):
10
- if not self.allowed("/search"):
11
- return []
12
 
 
13
  await self._rate_limit()
 
14
 
15
- url = f"{self.base_url}/search/site/{query}"
16
  async with aiohttp.ClientSession() as session:
17
- async with session.get(url) as r:
18
- if r.status != 200:
19
- self.health = "down"
20
- return []
21
- text = await r.text()
22
-
23
- self.health = "up"
24
-
25
- return [{
26
- "id": hash(query),
27
- "source": self.source_name,
28
- "title": f"CIA result for {query}",
29
- "url": url,
30
- "snippet": "Public CIA FOIA document",
31
- "date": None,
32
- "exemptions": [],
33
- "citation": f"{self.source_name}, {url}"
34
- }]
 
 
 
 
 
1
  import aiohttp
2
+ from bs4 import BeautifulSoup
3
  from ingest.generic_public_foia import GenericFOIAAdapter
4
 
5
  class CIAAdapter(GenericFOIAAdapter):
6
+ source_name = "CIA CREST"
7
+ agency = "CIA"
8
+ base_url = "https://www.cia.gov/readingroom/search/site"
 
 
 
 
9
 
10
+ async def search(self, query: str):
11
  await self._rate_limit()
12
+ params = {"search_api_fulltext": query}
13
 
 
14
  async with aiohttp.ClientSession() as session:
15
+ async with session.get(self.base_url, params=params) as resp:
16
+ html = await resp.text()
17
+
18
+ soup = BeautifulSoup(html, "html.parser")
19
+ results = []
20
+
21
+ for r in soup.select(".search-result"):
22
+ title_el = r.select_one("h3")
23
+ link_el = r.select_one("a")
24
+
25
+ if not title_el or not link_el:
26
+ continue
27
+
28
+ results.append({
29
+ "source": self.source_name,
30
+ "agency": self.agency,
31
+ "title": title_el.get_text(strip=True),
32
+ "url": "https://www.cia.gov" + link_el["href"],
33
+ "snippet": ""
34
+ })
35
+
36
+ return results