Spaces:
Sleeping
Sleeping
Delete generic_public_foia.py
Browse files- generic_public_foia.py +0 -46
generic_public_foia.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
# ingest/generic_public_foia.py
|
| 2 |
-
|
| 3 |
-
import requests
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
-
from urllib.parse import urlparse
|
| 6 |
-
from typing import Dict
|
| 7 |
-
from .agency_registry import ALLOWED_FOIA_SOURCES
|
| 8 |
-
|
| 9 |
-
MAX_CHARS = 12000
|
| 10 |
-
|
| 11 |
-
def infer_agency_from_url(url: str) -> str:
    """Return the agency name registered for *url*'s host, or "Unknown".

    The host is compared against the domain keys of ALLOWED_FOIA_SOURCES,
    skipping pseudo-entries whose key starts with "label:".

    Args:
        url: Any URL string; only its network location is inspected.

    Returns:
        The matching agency name, or "Unknown" when no domain matches.
    """
    host = urlparse(url).netloc.lower()

    for domain, agency in ALLOWED_FOIA_SOURCES.items():
        # "label:" keys are registry metadata, not real domains.
        if domain.startswith("label:"):
            continue
        # BUG FIX: the original used `domain in host`, a plain substring test,
        # so a hostile host like "fbi.gov.evil.com" would match "fbi.gov".
        # Require an exact host match or a proper subdomain suffix instead.
        if host == domain or host.endswith("." + domain):
            return agency

    return "Unknown"
|
| 21 |
-
|
| 22 |
-
def ingest_public_foia_url(url: str) -> Dict:
    """Fetch and extract text from a single, user-supplied public FOIA page.

    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    Args:
        url: The page to ingest; must be an http(s) URL.

    Returns:
        Dict with keys: "agency", "url", "title", "text", "source_type".

    Raises:
        ValueError: if the URL scheme is not http or https.
        requests.HTTPError: if the server returns an error status.
    """
    # BUG FIX: the original fetched any scheme verbatim despite the "SAFE
    # ingestion" contract. Reject non-web schemes (file://, ftp://, ...) early.
    scheme = urlparse(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"Unsupported URL scheme: {scheme!r}")

    agency = infer_agency_from_url(url)

    # NOTE(review): the URL is still fetched even when agency == "Unknown";
    # enforcing the allowlist here would further harden against SSRF — confirm
    # whether unknown sources are intentionally permitted.
    r = requests.get(url, timeout=15)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")

    # Prefer the page's first <h1> as the title; fall back to a generic label.
    title = soup.find("h1")
    text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        "title": title.text.strip() if title else "Public FOIA Document",
        "text": text[:MAX_CHARS],  # cap stored text to keep records bounded
        "source_type": "public_foia",
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|