GodsDevProject committed on
Commit
4ab9c9c
·
verified ·
1 Parent(s): 95d4bbc

Delete generic_public_foia.py

Browse files
Files changed (1) hide show
  1. generic_public_foia.py +0 -46
generic_public_foia.py DELETED
@@ -1,46 +0,0 @@
1
- # ingest/generic_public_foia.py
2
-
3
- import requests
4
- from bs4 import BeautifulSoup
5
- from urllib.parse import urlparse
6
- from typing import Dict
7
- from .agency_registry import ALLOWED_FOIA_SOURCES
8
-
9
- MAX_CHARS = 12000
10
-
11
def infer_agency_from_url(url: str) -> str:
    """Return the agency registered for the host of *url*, or ``"Unknown"``.

    The host is compared against the keys of ``ALLOWED_FOIA_SOURCES``.
    Keys starting with ``"label:"`` are not domains and are skipped.

    A key matches only when the host equals it or is a subdomain of it.
    A plain substring test would also accept look-alike hosts such as
    ``fbi.gov.evil.example`` or ``notfbi.gov``.

    NOTE(review): this assumes registry keys are bare domain names
    (e.g. ``"fbi.gov"``) -- confirm against ``agency_registry``.

    Args:
        url: Absolute URL supplied by the user.

    Returns:
        The agency name mapped to the first matching registry domain, or
        ``"Unknown"`` when the URL has no host or no domain matches.
    """
    # ``hostname`` drops userinfo and port, so "http://fbi.gov@evil.example/"
    # is judged by its real host rather than by the text before the "@".
    host = (urlparse(url).hostname or "").lower().rstrip(".")
    if not host:
        return "Unknown"

    for domain, agency in ALLOWED_FOIA_SOURCES.items():
        if domain.startswith("label:"):
            continue

        registered = domain.lower().strip(".")
        if not registered:
            continue

        # Match on a label boundary: exact host or a true subdomain.
        if host == registered or host.endswith("." + registered):
            return agency

    return "Unknown"
21
-
22
def ingest_public_foia_url(url: str) -> Dict[str, str]:
    """
    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    Fetch *url* once (15 second timeout), parse the response body as HTML
    and return a flat record describing the page.

    NOTE(review): this function does not reject hosts that are missing from
    the registry; such URLs are still fetched and labelled ``"Unknown"``.
    Confirm that callers validate the URL before calling.

    Args:
        url: Absolute URL of the page to ingest.

    Returns:
        A dict with the keys ``"agency"``, ``"url"``, ``"title"``,
        ``"text"`` (page text truncated to ``MAX_CHARS`` characters) and
        ``"source_type"`` (always ``"public_foia"``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request itself fails.
    """
    agency = infer_agency_from_url(url)

    response = requests.get(url, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    heading = soup.find("h1")
    # An empty or whitespace-only <h1> must fall back to the default title
    # just like a missing one, instead of yielding an empty string.
    heading_text = heading.text.strip() if heading else ""
    page_text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        "title": heading_text or "Public FOIA Document",
        "text": page_text[:MAX_CHARS],
        "source_type": "public_foia",
    }