Spaces:
Sleeping
Sleeping
| # ingest/generic_public_foia.py | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urlparse | |
| from typing import Dict | |
| from .agency_registry import ALLOWED_FOIA_SOURCES | |
| MAX_CHARS = 12000 | |
def infer_agency_from_url(url: str, *, sources: "Dict[str, str] | None" = None) -> str:
    """Return the agency whose registered domain owns the host of *url*.

    A registry domain matches when the URL's hostname is exactly that
    domain or a sub-domain of it (``www.cia.gov`` matches ``cia.gov``).
    Matching is done on label boundaries, so look-alike hosts such as
    ``notcia.gov.evil.com`` or ``evilcia.gov`` do not match.

    Args:
        url: The URL to classify.
        sources: Optional ``{domain: agency}`` mapping to consult instead of
            the module-level ``ALLOWED_FOIA_SOURCES`` registry.

    Returns:
        The agency name of the first matching registry entry, or
        ``"Unknown"`` when the URL has no hostname or nothing matches.
    """
    registry = ALLOWED_FOIA_SOURCES if sources is None else sources

    # ``hostname`` (unlike ``netloc``) excludes userinfo and port, so a URL
    # like ``https://cia.gov@evil.com/`` is classified by its real host.
    host = (urlparse(url).hostname or "").lower().rstrip(".")
    if not host:
        return "Unknown"

    for domain, agency in registry.items():
        # Keys prefixed with "label:" are not domains; skip them.
        if domain.startswith("label:"):
            continue
        candidate = domain.lower().strip(".")
        if not candidate:
            continue
        # Exact host or dot-delimited suffix only -- never a bare substring.
        if host == candidate or host.endswith("." + candidate):
            return agency
    return "Unknown"
def ingest_public_foia_url(url: str) -> Dict:
    """
    Fetch one public FOIA page and return its title and truncated text.

    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    NOTE(review): the URL is fetched even when the agency resolves to
    "Unknown"; if the allow-list must be enforced, the caller has to do it.

    Args:
        url: Address of the page to fetch (a single GET, 15 s timeout).

    Returns:
        A dict with keys ``agency``, ``url``, ``title`` (text of the first
        ``<h1>``, or "Public FOIA Document" when it is missing or empty),
        ``text`` (page text, at most ``MAX_CHARS`` characters) and
        ``source_type`` (always "public_foia").

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    agency = infer_agency_from_url(url)

    r = requests.get(url, timeout=15)
    r.raise_for_status()

    # requests falls back to ISO-8859-1 for text/* responses whose header
    # carries no charset, which garbles UTF-8 pages when using ``r.text``.
    # Parse the raw bytes instead: honour the charset only if the server
    # actually declared one, otherwise let BeautifulSoup detect it
    # (e.g. from a <meta charset> tag).
    content_type = r.headers.get("Content-Type", "")
    declared_encoding = r.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(r.content, "html.parser", from_encoding=declared_encoding)

    heading = soup.find("h1")
    title = heading.text.strip() if heading else ""
    text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        # An empty <h1> falls back to the generic title as well.
        "title": title or "Public FOIA Document",
        "text": text[:MAX_CHARS],
        "source_type": "public_foia",
    }