Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

GodsDevProject committed on Jan 10

Commit

f680dc4

verified ·

1 Parent(s): 507d753

Create ingest/adapters/cia.py

Files changed (1) hide show

ingest/adapters/cia.py CHANGED Viewed

@@ -1,34 +1,36 @@
 import aiohttp
 from ingest.generic_public_foia import GenericFOIAAdapter
 class CIAAdapter(GenericFOIAAdapter):
-    source_name = "CIA FOIA Reading Room"
-    base_url = "https://www.cia.gov/readingroom"
-    is_live = True
-    async def search(self, query):
-        if not self.allowed("/search"):
-            return []
         await self._rate_limit()
-        url = f"{self.base_url}/search/site/{query}"
         async with aiohttp.ClientSession() as session:
-            async with session.get(url) as r:
-                if r.status != 200:
-                    self.health = "down"
-                    return []
-                text = await r.text()
-        self.health = "up"
-        return [{
-            "id": hash(query),
-            "source": self.source_name,
-            "title": f"CIA result for {query}",
-            "url": url,
-            "snippet": "Public CIA FOIA document",
-            "date": None,
-            "exemptions": [],
-            "citation": f"{self.source_name}, {url}"
-        }]

 import aiohttp
+from bs4 import BeautifulSoup
 from ingest.generic_public_foia import GenericFOIAAdapter
 class CIAAdapter(GenericFOIAAdapter):
+    source_name = "CIA CREST"
+    agency = "CIA"
+    base_url = "https://www.cia.gov/readingroom/search/site"
+    async def search(self, query: str):
         await self._rate_limit()
+        params = {"search_api_fulltext": query}
         async with aiohttp.ClientSession() as session:
+            async with session.get(self.base_url, params=params) as resp:
+                html = await resp.text()
+        soup = BeautifulSoup(html, "html.parser")
+        results = []
+        for r in soup.select(".search-result"):
+            title_el = r.select_one("h3")
+            link_el = r.select_one("a")
+            if not title_el or not link_el:
+                continue
+            results.append({
+                "source": self.source_name,
+                "agency": self.agency,
+                "title": title_el.get_text(strip=True),
+                "url": "https://www.cia.gov" + link_el["href"],
+                "snippet": ""
+            })
+        return results