FOIA_Doc_Search / ingest /generic_public_foia.py
GodsDevProject's picture
Upload 31 files
6aba5f3 verified
raw
history blame
1.11 kB
# ingest/generic_public_foia.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import Dict
from .agency_registry import ALLOWED_FOIA_SOURCES
# Maximum number of characters of extracted page text kept per ingested
# document; longer pages are truncated to this length.
MAX_CHARS = 12000
def infer_agency_from_url(url: str) -> str:
    """Return the agency registered for the host of *url*, or ``"Unknown"``.

    The host is compared against the keys of ``ALLOWED_FOIA_SOURCES``. A key
    matches when the host is exactly that domain or a subdomain of it
    (``vault.fbi.gov`` matches ``fbi.gov``). Plain substring matching is
    deliberately avoided: it would attribute look-alike hosts such as
    ``fbi.gov.evil.example`` or ``notfbi.gov`` to a registered agency.

    Keys starting with ``"label:"`` are skipped; they are not domains.
    NOTE(review): assumes the remaining keys are bare domain names
    (e.g. ``"fbi.gov"``) -- confirm against the agency registry.

    Args:
        url: Absolute URL supplied by the user.

    Returns:
        The agency value of the first matching registry entry, or
        ``"Unknown"`` when the URL has no host or no entry matches.
    """
    # ``hostname`` (unlike ``netloc``) drops any ``user:pass@`` prefix and
    # ``:port`` suffix and is already lower-cased, so credentials or ports
    # in the URL cannot influence the match.
    host = (urlparse(url).hostname or "").rstrip(".")
    if not host:
        return "Unknown"

    for domain, agency in ALLOWED_FOIA_SOURCES.items():
        if domain.startswith("label:"):
            continue
        registered = domain.lower().strip(".")
        if not registered:
            continue
        # Exact host, or a subdomain delimited by a dot.
        if host == registered or host.endswith("." + registered):
            return agency
    return "Unknown"
def ingest_public_foia_url(url: str) -> Dict:
    """
    Fetch one user-supplied public FOIA page and extract its text.

    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    Exactly one HTTP GET is issued (15 second timeout); no links on the
    page are followed by this function.

    NOTE(review): the URL is fetched even when the agency resolves to
    "Unknown" -- this function does not itself enforce the allow-list, so
    callers presumably validate the URL first; confirm.

    Args:
        url: Absolute URL of the page to ingest.

    Returns:
        A dict with keys ``agency`` (from ``infer_agency_from_url``),
        ``url`` (as given), ``title`` (text of the first ``<h1>``, or
        "Public FOIA Document" when there is none or it is empty),
        ``text`` (page text truncated to ``MAX_CHARS`` characters) and
        ``source_type`` (always "public_foia").

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    agency = infer_agency_from_url(url)

    response = requests.get(url, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # An <h1> that is present but empty/whitespace-only must not produce an
    # empty title; fall back to the generic one in that case as well.
    heading = soup.find("h1")
    title = heading.text.strip() if heading else ""

    page_text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        "title": title or "Public FOIA Document",
        "text": page_text[:MAX_CHARS],
        "source_type": "public_foia",
    }