GodsDevProject committed on
Commit
ef1ca61
·
verified ·
1 Parent(s): 0c65402

Update ingest/cia_reading_room.py

Browse files
Files changed (1) hide show
  1. ingest/cia_reading_room.py +5 -10
ingest/cia_reading_room.py CHANGED
@@ -1,23 +1,18 @@
1
  import requests
2
- from typing import Dict
3
  from bs4 import BeautifulSoup
4
 
5
- def ingest_cia_reading_room(url: str) -> Dict:
6
- """
7
- Ingest a single CIA Reading Room document page.
8
- """
9
  r = requests.get(url, timeout=10)
10
  r.raise_for_status()
11
-
12
  soup = BeautifulSoup(r.text, "html.parser")
13
 
 
14
  title = soup.find("h1")
15
- body = soup.get_text(separator=" ", strip=True)
16
 
17
  return {
18
  "source": "CIA Reading Room",
 
19
  "url": url,
20
- "title": title.text if title else "Untitled CIA Document",
21
- "text": body,
22
- "agency": "CIA"
23
  }
 
1
  import requests
 
2
  from bs4 import BeautifulSoup
3
 
4
+ def ingest_cia_reading_room(url: str) -> dict:
 
 
 
5
  r = requests.get(url, timeout=10)
6
  r.raise_for_status()
 
7
  soup = BeautifulSoup(r.text, "html.parser")
8
 
9
+ text = soup.get_text(separator=" ", strip=True)
10
  title = soup.find("h1")
 
11
 
12
  return {
13
  "source": "CIA Reading Room",
14
+ "agency": "CIA",
15
  "url": url,
16
+ "title": title.text if title else "CIA FOIA Document",
17
+ "text": text[:10000]
 
18
  }