Spaces:
Sleeping
Sleeping
Delete generic_public_foia.py
Browse files- generic_public_foia.py +0 -46
generic_public_foia.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
# ingest/generic_public_foia.py
|
| 2 |
-
|
| 3 |
-
import requests
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
-
from urllib.parse import urlparse
|
| 6 |
-
from typing import Dict
|
| 7 |
-
from .agency_registry import ALLOWED_FOIA_SOURCES
|
| 8 |
-
|
| 9 |
-
MAX_CHARS = 12000
|
| 10 |
-
|
| 11 |
-
def infer_agency_from_url(url: str) -> str:
    """Return the agency name registered for *url*'s host, or "Unknown".

    The host is compared against the domain keys of ALLOWED_FOIA_SOURCES,
    skipping pseudo-entries whose key starts with "label:".

    Args:
        url: Any URL string; only its network location is inspected.

    Returns:
        The matching agency name, or "Unknown" when no domain matches.
    """
    host = urlparse(url).netloc.lower()

    for domain, agency in ALLOWED_FOIA_SOURCES.items():
        # "label:" keys are registry metadata, not real domains.
        if domain.startswith("label:"):
            continue
        # BUG FIX: the original used `domain in host`, a plain substring test,
        # so a hostile host like "fbi.gov.evil.com" would match "fbi.gov".
        # Require an exact host match or a proper subdomain suffix instead.
        if host == domain or host.endswith("." + domain):
            return agency

    return "Unknown"
|
| 21 |
-
|
| 22 |
-
def ingest_public_foia_url(url: str) -> Dict:
    """Fetch and extract text from a single, user-supplied public FOIA page.

    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    Args:
        url: The page to ingest; must be an http(s) URL.

    Returns:
        Dict with keys: "agency", "url", "title", "text", "source_type".

    Raises:
        ValueError: if the URL scheme is not http or https.
        requests.HTTPError: if the server returns an error status.
    """
    # BUG FIX: the original fetched any scheme verbatim despite the "SAFE
    # ingestion" contract. Reject non-web schemes (file://, ftp://, ...) early.
    scheme = urlparse(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Unsupported URL scheme: {scheme!r}")

    agency = infer_agency_from_url(url)

    # NOTE(review): the URL is still fetched even when agency == "Unknown";
    # enforcing the allowlist here would further harden against SSRF — confirm
    # whether unknown sources are intentionally permitted.
    r = requests.get(url, timeout=15)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")

    # Prefer the page's first <h1> as the title; fall back to a generic label.
    title = soup.find("h1")
    text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        "title": title.text.strip() if title else "Public FOIA Document",
        "text": text[:MAX_CHARS],  # cap stored text to keep records bounded
        "source_type": "public_foia",
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|