Spaces:

GodsDevProject
/

FOIA_Declassified_Document_Search

Sleeping

GodsDevProject committed on Jan 9

Commit

ef1ca61

verified ·

1 Parent(s): 0c65402

Update ingest/cia_reading_room.py

Files changed (1) hide show

ingest/cia_reading_room.py CHANGED Viewed

@@ -1,23 +1,18 @@
 import requests
-from typing import Dict
 from bs4 import BeautifulSoup
-def ingest_cia_reading_room(url: str) -> Dict:
-    """
-    Ingest a single CIA Reading Room document page.
-    """
     r = requests.get(url, timeout=10)
     r.raise_for_status()
     soup = BeautifulSoup(r.text, "html.parser")
     title = soup.find("h1")
-    body = soup.get_text(separator=" ", strip=True)
     return {
         "source": "CIA Reading Room",
         "url": url,
-        "title": title.text if title else "Untitled CIA Document",
-        "text": body,
-        "agency": "CIA"
     }

 import requests
 from bs4 import BeautifulSoup
+def ingest_cia_reading_room(url: str) -> dict:
     r = requests.get(url, timeout=10)
     r.raise_for_status()
     soup = BeautifulSoup(r.text, "html.parser")
+    text = soup.get_text(separator=" ", strip=True)
     title = soup.find("h1")
     return {
         "source": "CIA Reading Room",
+        "agency": "CIA",
         "url": url,
+        "title": title.text if title else "CIA FOIA Document",
+        "text": text[:10000]
     }