GodsDevProject committed on
Commit
6aba5f3
·
verified ·
1 Parent(s): 7c43419

Upload 31 files

Browse files
ingest/__init__.py ADDED
File without changes
ingest/aatip_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class AATIPAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the AATIP source.

    Restricted to publicly released materials only. Only class-level
    settings are declared here; all behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag only; where robots.txt is actually
    # checked is not visible in this module -- confirm against the base class.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "AATIP"
ingest/agency_registry.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ingest/agency_registry.py
2
+
3
+ # Domains are PUBLIC FOIA / reading room hosts only.
4
+ # Labels may include units if (and only if) a public FOIA page exists
5
+ # and the user provides its URL.
6
+
7
# Core agencies.
_CORE_SOURCES = {
    "vault.fbi.gov": "FBI",
    "www.cia.gov": "CIA",
    "www.archives.gov": "NARA",
    "foia.state.gov": "State Dept",
    "www.nsa.gov": "NSA",
    "www.defense.gov": "DoD",
    "www.esd.whs.mil": "DoD FOIA",
    "www.whitehouse.gov": "White House",
}

# Military services (public FOIA pages).
_MILITARY_SOURCES = {
    "www.af.mil": "USAF",
    "www.navy.mil": "US Navy",
    "www.marines.mil": "USMC",
    "www.army.mil": "US Army",
    "www.spaceforce.mil": "US Space Force",
}

# Intelligence / defense components (public FOIA pages only).
_INTEL_SOURCES = {
    "www.dia.mil": "DIA",
    "www.nro.gov": "NRO",
}

# Law enforcement / protective services (public FOIA pages).
_LAW_ENFORCEMENT_SOURCES = {
    "www.secretservice.gov": "US Secret Service",
    "www.dea.gov": "DEA",
}

# Historical / organizational labels. These are NOT hosts and make no claim
# of a dedicated public repository; they are only labels to apply when a
# public FOIA URL is supplied. Keys carry the "label:" prefix.
_LABEL_ONLY_SOURCES = {
    "label:SAC": "CIA Special Activities Center (label only)",
    "label:SAD": "CIA Special Activities Division (label only)",
    "label:NIA": "National Intelligence Authority (historical)",
}

# Host (or "label:" key) -> agency name. Merged in the order listed so that
# iteration order is the same as a single flat literal.
ALLOWED_FOIA_SOURCES = {
    **_CORE_SOURCES,
    **_MILITARY_SOURCES,
    **_INTEL_SOURCES,
    **_LAW_ENFORCEMENT_SOURCES,
    **_LABEL_ONLY_SOURCES,
}
ingest/air_force_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USAirForceAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Air Force source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USAirForce"
ingest/army_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USArmyAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Army source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USArmy"
ingest/cia_reading_room.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from ingest.generic_public_foia import GenericFOIAAdapter
3
+
4
class CIAAdapter(GenericFOIAAdapter):
    """Adapter that queries the CIA FOIA reading-room search page."""

    # Search endpoint that receives the query string.
    base_url = "https://www.cia.gov/readingroom/search/site/"
    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # NOTE(review): presumably requests per second, as in sibling adapters.
    rate_limit = 1
    # Identifier of this source.
    name = "CIA"

    async def search(self, query: str):
        """Request the search page for ``query`` and report a page-level hit.

        Returns a one-element list describing the fetched results page when
        the response status is 200, otherwise an empty list. The page body
        is not parsed. Errors raised by ``httpx`` are not caught here.
        """
        async with httpx.AsyncClient(timeout=10) as http:
            response = await http.get(self.base_url, params={"query": query})
            if response.status_code == 200:
                # Minimal safe parse: only the results page itself is reported.
                page_hit = {
                    "source": "CIA FOIA Reading Room",
                    "query": query,
                    "url": str(response.url),
                    "snippet": "Public FOIA search result page",
                }
                return [page_hit]
            return []
ingest/coast_guard_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USCoastGuardAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Coast Guard source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USCoastGuard"
ingest/darpa_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DARPAAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the DARPA source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "DARPA"
ingest/dhs_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DHSAdapter(GenericFOIAAdapter):
    """Adapter settings for the DHS source.

    Only class-level settings are declared; behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "DHS"
ingest/dia_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DIAAdapter(GenericFOIAAdapter):
    """Adapter settings for the DIA source.

    Only class-level settings are declared; behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "DIA"
ingest/dod_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DoDAdapter(GenericFOIAAdapter):
    """Adapter settings for the DoD source.

    Only class-level settings are declared; behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "DoD"
ingest/dod_reading_room_live.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from ingest.generic_public_foia import GenericFOIAAdapter
3
+
4
class DoDAdapter(GenericFOIAAdapter):
    """Adapter that queries the DoD FOIA reading-room page."""

    # Reading-room page that receives the query string.
    base_url = "https://www.esd.whs.mil/FOIA/Reading-Room/"
    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # NOTE(review): presumably requests per second, as in sibling adapters.
    rate_limit = 1
    # Identifier of this source.
    name = "DoD FOIA Reading Room"

    async def search(self, query: str):
        """Request the reading-room page for ``query`` and report a page-level hit.

        Returns a one-element list describing the fetched page when the
        response status is 200, otherwise an empty list. The page body is
        not parsed. Errors raised by ``httpx`` are not caught here.
        """
        async with httpx.AsyncClient(timeout=10) as http:
            response = await http.get(self.base_url, params={"search": query})
            if response.status_code == 200:
                page_hit = {
                    "source": "DoD FOIA Reading Room",
                    "query": query,
                    "url": str(response.url),
                    "snippet": "Public DoD FOIA reading room page",
                }
                return [page_hit]
            return []
ingest/fbi_vault.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
def ingest_fbi_vault(url: str) -> dict:
    """Fetch one FBI Vault page and return its title and visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``source``, ``agency``, ``url``, ``title`` and
        ``text``. ``title`` is the text of the first ``<h1>`` (or a fixed
        placeholder when the page has none); ``text`` is the page's visible
        text capped at 10000 characters.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    heading = soup.find("h1")
    page_text = soup.get_text(separator=" ", strip=True)

    # The previous body called the returned dict literal (a TypeError at
    # runtime) and carried an unreachable second copy of this logic; a
    # single return path is kept.
    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        # Stripped so headings padded with whitespace yield a clean title.
        "title": heading.text.strip() if heading else "FBI Vault Document",
        # Cap the payload so a very large page cannot bloat the result.
        "text": page_text[:10000],
    }
ingest/fbi_vault_live.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from ingest.generic_public_foia import GenericFOIAAdapter
3
+
4
class FBIAdapter(GenericFOIAAdapter):
    """Adapter that queries the FBI Vault search page."""

    # Search endpoint that receives the query string.
    base_url = "https://vault.fbi.gov/search"
    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # NOTE(review): presumably requests per second, as in sibling adapters.
    rate_limit = 1
    # Identifier of this source.
    name = "FBI Vault"

    async def search(self, query: str):
        """Request the search page for ``query`` and report a page-level hit.

        Returns a one-element list describing the fetched results page when
        the response status is 200, otherwise an empty list. The page body
        is not parsed. Errors raised by ``httpx`` are not caught here.
        """
        async with httpx.AsyncClient(timeout=10) as http:
            response = await http.get(self.base_url, params={"q": query})
            if response.status_code == 200:
                page_hit = {
                    "source": "FBI Vault",
                    "query": query,
                    "url": str(response.url),
                    "snippet": "Public FBI Vault search results",
                }
                return [page_hit]
            return []
ingest/generic_public_foia.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ingest/generic_public_foia.py
2
+
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from urllib.parse import urlparse
6
+ from typing import Dict
7
+ from .agency_registry import ALLOWED_FOIA_SOURCES
8
+
9
+ MAX_CHARS = 12000
10
+
11
def infer_agency_from_url(url: str, sources=None) -> str:
    """Map a URL's host to an agency name using the FOIA source registry.

    The host is matched exactly or as a dot-delimited subdomain of a
    registered domain. A plain substring test would also accept look-alike
    hosts such as ``www.cia.gov.evil.example`` or URLs that smuggle the
    domain into the userinfo part (``https://www.cia.gov@evil.example/``).

    Args:
        url: The URL to classify.
        sources: Optional mapping of domain -> agency name. Defaults to the
            module-level ``ALLOWED_FOIA_SOURCES``. Keys starting with
            ``"label:"`` are not hosts and are skipped.

    Returns:
        The agency name of the first matching domain, or ``"Unknown"``.
    """
    if sources is None:
        sources = ALLOWED_FOIA_SOURCES

    # ``hostname`` drops userinfo and port and is already lower-cased; it is
    # None when the URL has no network location.
    host = (urlparse(url).hostname or "").rstrip(".")
    if not host:
        return "Unknown"

    for domain, agency in sources.items():
        if domain.startswith("label:"):
            continue
        registered = domain.lower()
        if host == registered or host.endswith("." + registered):
            return agency

    return "Unknown"
21
+
22
def ingest_public_foia_url(url: str) -> Dict:
    """Fetch a single user-supplied public FOIA page and extract its text.

    SAFE ingestion:
    - user-supplied URL only
    - public FOIA / reading room pages
    - no crawling, no discovery

    NOTE(review): the host is only used to label the result; a URL whose
    host is not in the registry is still fetched and labelled "Unknown".
    Confirm whether unknown hosts should be rejected before the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``agency``, ``url``, ``title``, ``text`` and
        ``source_type``. ``text`` is capped at ``MAX_CHARS`` characters.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    agency = infer_agency_from_url(url)

    response = requests.get(url, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    heading = soup.find("h1")
    page_text = soup.get_text(separator=" ", strip=True)

    return {
        "agency": agency,
        "url": url,
        "title": heading.text.strip() if heading else "Public FOIA Document",
        "text": page_text[:MAX_CHARS],
        "source_type": "public_foia",
    }


class GenericFOIAAdapter:
    """Minimal base class for the per-agency FOIA adapters.

    The adapter modules import this name from here and subclass it,
    overriding the class attributes below and, for live sources, ``search``.
    Without this definition those imports fail with ImportError.

    NOTE(review): the interface is derived from how the subclasses use it;
    confirm it matches the intended design.
    """

    # Identifier of the source.
    name = "Generic"
    # Request budget, in requests per second.
    rate_limit = 1
    # Declarative flag; this base class does not fetch anything itself.
    robots_respected = True
    # Search endpoint; empty for adapters without a live search.
    base_url = ""

    async def search(self, query: str):
        """Return search hits for ``query``; the base adapter has none."""
        return []
ingest/ice_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class ICEAdapter(GenericFOIAAdapter):
    """Adapter settings for the ICE source.

    Only class-level settings are declared; behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "ICE"
ingest/loader.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from typing import List, Dict
4
+
5
def ingest_documents(enable_scraping: bool = False) -> List[Dict]:
    """Return up to ten link records from the FBI Vault landing page.

    This is a capped, read-only metadata fetch. It is best-effort: any
    failure is logged and whatever was collected so far (possibly nothing)
    is returned, so the function never raises.

    Args:
        enable_scraping: When False (the default) no request is made and an
            empty list is returned.

    Returns:
        A list of dicts with the keys ``title`` (link text), ``agency``
        (always ``"FBI"``), ``date`` (always empty) and ``content`` (the
        link's ``href``, or an empty string when it has none).
    """
    if not enable_scraping:
        return []

    # Local import: the module header does not import logging.
    import logging

    docs: List[Dict] = []
    try:
        response = requests.get("https://vault.fbi.gov", timeout=10)
        # Without this, an HTTP error page would be parsed as if it were
        # the landing page and its links returned as documents.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs.extend(
            {
                "title": link.text.strip(),
                "agency": "FBI",
                "date": "",
                "content": link.get("href", ""),
            }
            for link in soup.select("a")[:10]
        )
    except Exception:
        # Deliberately best-effort, but record the failure instead of
        # discarding it silently.
        logging.getLogger(__name__).warning(
            "FBI Vault metadata fetch failed", exc_info=True
        )

    return docs
ingest/marine_corps_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USMarineCorpsAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Marine Corps source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USMarineCorps"
ingest/navy_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USNavyAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Navy source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USNavy"
ingest/nia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class NIAAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the NIA source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "NIA"
ingest/nis_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class NISAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the NIS source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "NIS"
ingest/nro_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class NROAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the NRO source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "NRO"
ingest/nsa_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class NSAAdapter(GenericFOIAAdapter):
    """Adapter settings for the NSA source.

    Only class-level settings are declared; behaviour is inherited from
    ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "NSA"
ingest/sap_public_releases.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class SpecialAccessProgramsAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the SpecialAccessPrograms source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "SpecialAccessPrograms"
ingest/secret_service_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class SecretServiceAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the Secret Service source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "SecretService"
ingest/sources.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Agency label -> public FOIA / reading-room landing URL.
# NOTE(review): "White House" points at the archives.gov FOIA page and
# "USAF" at afhra.af.mil -- confirm these are the intended hosts.
AGENCY_SOURCES = {
    "FBI": "https://vault.fbi.gov",
    "CIA": "https://www.cia.gov/readingroom",
    "DoD": "https://www.esd.whs.mil/FOIA/",
    "NSA": "https://www.nsa.gov/Helpful-Links/FOIA/",
    "NRO": "https://www.nro.gov/FOIA/",
    "USAF": "https://www.afhra.af.mil/FOIA/",
    "White House": "https://www.archives.gov/foia",
}
ingest/space_force_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USSpaceForceAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the US Space Force source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "USSpaceForce"
ingest/special_activities_public.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class SpecialActivitiesAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the SpecialActivities source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "SpecialActivities"
ingest/special_projects_public.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class SpecialProjectsAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the SpecialProjects source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "SpecialProjects"
ingest/tencap_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class TENCAPAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter settings for the TENCAP source.

    Restricted to publicly released materials only. The class only
    declares settings; behaviour is inherited from ``GenericFOIAAdapter``.
    """

    # NOTE(review): declarative flag; enforcement is not visible here.
    robots_respected = True
    # Request budget, in requests per second.
    rate_limit = 1
    # Identifier of this source.
    name = "TENCAP"