FOIA_Doc_Search / ingest /fbi_vault.py
GodsDevProject's picture
Upload 31 files
6aba5f3 verified
raw
history blame
761 Bytes
import requests
from bs4 import BeautifulSoup
def ingest_fbi_vault(url: str) -> dict:
    """Fetch an FBI Vault page and return it as an ingest record.

    The page at *url* is downloaded, parsed as HTML, and flattened to
    plain text. The first ``<h1>`` element, when present, supplies the
    record title.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``"source"``, ``"agency"``, ``"url"``,
        ``"title"`` and ``"text"``. ``"text"`` holds at most the first
        10000 characters of the page's visible text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other request failures, such as
            the 10 second timeout expiring.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    heading = soup.find("h1")
    page_text = soup.get_text(separator=" ", strip=True)

    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        # Fall back to a generic title when the page has no <h1>.
        "title": heading.text if heading else "FBI Vault Document",
        # Cap the stored text so one very large page cannot bloat a record.
        "text": page_text[:10000],
    }