GodsDevProject committed on
Commit
f4a4150
·
verified ·
1 Parent(s): b00f508

Update ingest/fbi_vault.py

Browse files
Files changed (1) hide show
  1. ingest/fbi_vault.py +30 -24
ingest/fbi_vault.py CHANGED
@@ -1,29 +1,35 @@
1
- import requests
2
- from bs4 import BeautifulSoup
3
 
4
def ingest_fbi_vault(url: str) -> dict:
    """Fetch a single FBI Vault page and normalise it into an ingest record.

    Args:
        url: Absolute URL of the FBI Vault document page to fetch.

    Returns:
        A dict with the keys ``source``, ``agency``, ``url``, ``title`` and
        ``text``. ``text`` is the page's visible text, capped at 10 000
        characters to bound record size; ``title`` falls back to
        "FBI Vault Document" when the page has no <h1>.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status
            (via raise_for_status).
        requests.RequestException: on connection failures or timeout
            (10-second limit).
    """
    # NOTE(review): the committed version contained a botched merge — a stray
    # `}(r.text, "html.parser")` (syntax error) followed by a duplicated
    # extraction/return block with a different title default. This is the
    # single clean copy, keeping the first block's behavior.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    title = soup.find("h1")
    text = soup.get_text(separator=" ", strip=True)

    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        "title": title.text if title else "FBI Vault Document",
        "text": text[:10000],
    }
 
 
 
 
 
 
1
+ import aiohttp
2
+ from ingest.base_adapter import BaseAdapter
3
 
4
class FBIAdapter(BaseAdapter):
    """Live search adapter for the FBI Vault (vault.fbi.gov) FOIA reading room.

    Performs a shallow search against the public Vault search endpoint and
    reports at most one aggregate hit — deliberately no deep scraping of
    individual documents.
    """

    name = "FBI Vault"
    source_type = "live"
    base_url = "https://vault.fbi.gov"

    async def search(self, query: str):
        """Search the FBI Vault for *query*.

        Args:
            query: Free-text search term, sent as the ``q`` parameter.

        Returns:
            A list of result dicts (``title``, ``url``, ``snippet``,
            ``source``, ``live``). Empty on non-200 responses and on
            transport errors — this adapter is best-effort and never raises
            for network failures.
        """
        params = {"q": query}

        try:
            # ClientTimeout is aiohttp's documented way to bound a request;
            # passing a bare number for `timeout` is deprecated.
            timeout = aiohttp.ClientTimeout(total=15)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(
                    f"{self.base_url}/search",
                    params=params,
                ) as resp:
                    if resp.status != 200:
                        return []
                    text = await resp.text()
        except aiohttp.ClientError:
            # Consistent with the non-200 path: treat transport failures
            # (DNS, connection reset, server timeout) as "no results".
            return []

        # Minimal, safe parsing (no deep scrape): report one aggregate hit
        # when the results page references any reading-room document. The
        # original appended a byte-identical dict for every matching HTML
        # line; since all fields are constant for a given query, those
        # duplicates carried no information — dedupe to a single entry.
        if any("/reading-room/" in line for line in text.splitlines()):
            return [{
                "title": f"FBI Vault result for {query}",
                "url": self.base_url,
                "snippet": "Public FBI Vault FOIA document",
                "source": self.name,
                "live": True,
            }]
        return []