GodsDevProject committed on
Commit
bb0b469
·
verified ·
1 Parent(s): d7f9fdb

Upload 98 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
AGENCY_COVERAGE.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Agency Coverage Map
3
+
4
+ | Agency | Public FOIA Reading Room |
5
+ |------|---------------------------|
6
+ | CIA | https://www.cia.gov/readingroom/ |
7
+ | FBI | https://vault.fbi.gov/ |
8
+ | DoD | https://www.foia.mil/ |
9
+ | NSA | https://www.nsa.gov/readingroom/ |
10
+ | NRO | https://www.nro.gov/FOIA/ |
11
+ | DHS | https://www.dhs.gov/foia-reading-room |
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ This project is committed to providing a respectful, inclusive, and responsible environment for all contributors and users.
6
+
7
+ ## Acceptable Use
8
+
9
+ Participants agree to:
10
+ - Use this project for lawful, ethical, and non-harmful purposes
11
+ - Respect the public-record nature of FOIA documents
12
+ - Avoid speculative, defamatory, or misleading interpretations
13
+
14
+ ## Unacceptable Use
15
+
16
+ This project must not be used to:
17
+ - Harass or target individuals
18
+ - Make unsubstantiated allegations
19
+ - Claim access to classified or restricted information
20
+ - Bypass legal or ethical safeguards
21
+
22
+ ## Enforcement
23
+
24
+ Maintainers may remove content or restrict access that violates this Code of Conduct.
25
+
26
+ ---
27
+
28
+ This project is intended for civic transparency, education, and research.
Dockerfile.hf ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+ WORKDIR /app
3
+ COPY . /app
4
+ RUN pip install --no-cache-dir -r requirements.txt
5
+ CMD ["python", "app.py"]
ETHICS.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ethics Policy
2
+
3
+ ## Purpose
4
+
5
+ This project exists to support transparency, research, and public understanding of government records released under the Freedom of Information Act (FOIA).
6
+
7
+ ## Guiding Principles
8
+
9
+ - **Public Sources Only:** All data must originate from publicly released documents.
10
+ - **No Speculation:** The project does not infer, predict, or hypothesize beyond document text.
11
+ - **Citation First:** Outputs must be traceable to source material.
12
+ - **No Harm:** The tool must not be used to defame, harass, or mislead.
13
+
14
+ ## Redactions
15
+
16
+ Redacted content is respected. This project does not attempt to reconstruct or infer withheld information.
17
+
18
+ ## Accountability
19
+
20
+ Users are responsible for how they interpret and use results. This tool provides analytical assistance, not conclusions.
21
+
22
+ ---
23
+
24
+ Ethical transparency is foundational to this project.
FILE_INVENTORY.txt ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CODE_OF_CONDUCT.md
2
+ ETHICS.md
3
+ README.md
4
+ README_PROD.md
5
+ __init__.py
6
+ adapters/__init__.py
7
+ adapters/cia.py
8
+ adapters/common.py
9
+ adapters/dea.py
10
+ adapters/dhs.py
11
+ adapters/dia.py
12
+ adapters/dod.py
13
+ adapters/doj.py
14
+ adapters/fbi.py
15
+ adapters/ice.py
16
+ adapters/nia.py
17
+ adapters/nsa.py
18
+ app.py
19
+ appeal_pdf.py
20
+ appeals/__init__.py
21
+ appeals/pdf_appeal.py
22
+ audit.py
23
+ collaboration.py
24
+ collaboration/__init__.py
25
+ collaboration/icij.py
26
+ core/__init__.py
27
+ core/analysis.py
28
+ core/appeals.py
29
+ core/explain.py
30
+ core/index.py
31
+ core/multi_program.py
32
+ core/redaction.py
33
+ core/search.py
34
+ core/vector.py
35
+ data/demo/documents/aatip_sample.txt
36
+ data/demo/documents/tencap_sample.txt
37
+ data/demo/metadata.json
38
+ data/foia_sources.json
39
+ entity_graph.py
40
+ export_utils.py
41
+ file_structure.txt
42
+ foia_pdf.py
43
+ foia_requests.py
44
+ gitattributes.txt
45
+ ingest/__init__.py
46
+ ingest/agency_registry.py
47
+ ingest/cia_reading_room.py
48
+ ingest/fbi_vault.py
49
+ ingest/generic_public_foia.py
50
+ ingest/loader.py
51
+ ingest/sources.py
52
+ requirements.txt
53
+ schemas.py
54
+ search/__init__.py
55
+ search/semantic.py
56
+ semantic.py
57
+ tests/__init__.py
58
+ tests/test_core.py
59
+ tests/test_schema.py
60
+ vector_store.py
HASH_MANIFEST.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "CODE_OF_CONDUCT.md": "b674f96cae26f0050be863c4b8782510fcae5ab855f0822ec4a0217763a84601",
3
+ "ETHICS.md": "d4f7c23c1e60297712786e392800158fcbe21116576496632e8221b0b8a16ff2",
4
+ "README.md": "e9bfdd2d6a4422fcb132bd4033a69d2241574c31fab71820e4643491b3b1225b",
5
+ "app.py": "c2a2b16ce45a327de0d42196104cb7fc50ec29ff1cb1fb95517a8ca655a3192a",
6
+ "appeal_pdf.py": "2d28ca1d0e796bfb5da25eac05a91354aadd58deefd041acded9a01a64055f9c",
7
+ "audit.py": "01c286d4067c6fffcb990391d8f750719c1ccac07eafc4477ccbdd1be4dd11e8",
8
+ "collaboration.py": "7cbd52c0da9be9f205b2901d8a94f28cb96612ffe506bcad1c7991885cd2d947",
9
+ "entity_graph.py": "dbe21fa0d8e7528daeee34d598efba836ab6370ad609de80746be1b12a4e0ff5",
10
+ "export_utils.py": "a01a088fd650a947a7831e795508208d3caa430d099aa5a8d7823ba462f0a80e",
11
+ "file_structure.txt": "6eee55e586751e3ae1405349f01dd35703e678d8e105ea19fc58eb15e4c2a6fa",
12
+ "foia_pdf.py": "babbd69a2da67681f15596ab254174310b8381d5853da72fe068d31d746725ab",
13
+ "foia_requests.py": "ca9c765bb7a591c462a94b0aa42957d1b3124128266d4880f0654895ce0ca6c0",
14
+ "gitattributes.txt": "11ad7efa24975ee4b0c3c3a38ed18737f0658a5f75a0a96787b576a78a023361",
15
+ "requirements.txt": "444bc9beedfa3fde82790f47c1e9b94bab90be2fefd0648de0ffdebbcc2eb61c",
16
+ "schemas.py": "e08b38513be2572af7d022e013f037c4f614f2117db85d4d776c408be96815ef",
17
+ "semantic.py": "4ffcf9149f08b8e69473e5418588dd370bbd470b137f2d0761901fccf09238cf",
18
+ "vector_store.py": "c61701e38e12150c541d284e13824341dde1794d3b4149d2a7d332b8023ad923",
19
+ "__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
20
+ "README_PROD.md": "3b5d0a9f882f8f980a08452ca589a788b3c7cfe2ed8b7ca13a01f9c4a12e9060",
21
+ "adapters/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
22
+ "adapters/cia.py": "a934e6a67aa6662391814036a9084779f15ad9ca5059f5461e2c374dfa9c3344",
23
+ "adapters/common.py": "c76c7ea1ce1616a2c99bfeec47ca046e75088f4d807f9c94ce5f87c9eeed5714",
24
+ "adapters/dea.py": "f1b9832aeaabecf5da8f1125e33883ce28e37d081149f6c75bf9ef49ac3ead8a",
25
+ "adapters/dhs.py": "66c44ee323135ee8e3c0cb7a2bb83d2d9dc20f7b88b6db4823e1bd5d03be6227",
26
+ "adapters/dia.py": "5c003321750582f502bcf0e2115956edb9af3aa8937917b72d7b25036b493f6d",
27
+ "adapters/dod.py": "410726bcab164fa9991d0ba61b3d9586d271ee4d55f65d1bd02193e84f02ed30",
28
+ "adapters/doj.py": "56080addcaef0a01d2395b6d44a93e9e271bc569a688f65657617d730a054eac",
29
+ "adapters/fbi.py": "b81b80972adf70b8283f2c16b241d17f46ab3ab73cd3ab4155dc88f7afbbcfc2",
30
+ "adapters/ice.py": "f0d06239d483933ba53966bc8015b9ca9f3ead3ebb535f4f963f5a26afd340b3",
31
+ "adapters/nia.py": "cbc240d23d7ac144d0ca0a49e83341df579903092c13c7603cfe438e7dd58a84",
32
+ "adapters/nsa.py": "a5a7ff4f8d3b1397bccc6095471de814aad75e2711566065f8cf7f4f43c59303",
33
+ "appeals/pdf_appeal.py": "cfe7ca493bf9a4280eff3d90494b2e2afc8bfed92ee99d5e175c1daf49ddadf6",
34
+ "appeals/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
35
+ "collaboration/icij.py": "bd02217afd54664762594dfcd1e8088ac3666c641acd450d3b233cf05f08a641",
36
+ "collaboration/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
37
+ "core/analysis.py": "e745cc6ad43d5193c92b5d7c417db4546ec301a96761090a319f7a477722dd99",
38
+ "core/appeals.py": "9ac66f34fdb2e741b6341de258291fd99db7f4a95862e39aa4cae94448726609",
39
+ "core/explain.py": "accdde04f5faf85b48302917f6274a12f06b9058fac5941cfa7ce9a64a6c45a3",
40
+ "core/index.py": "d266fc0aacbc2445b25cafbc29530e9138bb626090fb716681f300976927903c",
41
+ "core/multi_program.py": "444928c79f9778ebffcdb47262ba63b2eb19d2ed4d97d5632682a92e91861138",
42
+ "core/redaction.py": "b99bbbcb659e1f60902bca7e2bde5b0c28f371b7a6feb9daff489bb8fd96b878",
43
+ "core/search.py": "5843e5ee44d88688862b73e5457ff596dd229fc9433600c2e1a978868c8a2296",
44
+ "core/vector.py": "518e78f8c363735f5629584d2d5e25876a7f80063cd74e72a080723380141ce8",
45
+ "core/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
46
+ "data/foia_sources.json": "8fe166a285717548afb937ad7a669020c60f91d9ec9f06dbdee9954f3396bd2a",
47
+ "data/demo/metadata.json": "89d069dd00b20d1c74eb6f192a09b9d11226d86d5f7754159b3b1717512302d3",
48
+ "data/demo/documents/aatip_sample.txt": "8b8d9a6167699a123885330093dac739025bfe0d7fabfdfd596707ab53db9f81",
49
+ "data/demo/documents/tencap_sample.txt": "e1930579e04e76cc2ced2b5b253fa59e907b28bbffba2f8c1710693cfc84b167",
50
+ "ingest/agency_registry.py": "89581ae5dcf6f0e5614939ce8538e17f4e22a1751d806bfce5cd51fbf9d35f85",
51
+ "ingest/cia_reading_room.py": "ebfa118842937a7929a1ce58998650f11081306e8a017d53f01e11262917f2e5",
52
+ "ingest/fbi_vault.py": "9a24fd572db556cc182239738ca2c551d6cb6a393a325f3fc8f6db9cbf1c157b",
53
+ "ingest/generic_public_foia.py": "60f174b9ada68330a70ca11898ae3fbb7d225e2f404265903a5079aaa274baa1",
54
+ "ingest/loader.py": "12b2b68d4c3a902270be73bebb1218314f19b225f7df4e436191f433378aca18",
55
+ "ingest/sources.py": "4b995bff081e14cbe3b66deb516abc74fce09e29f3e36463f60bbbcaf11b075b",
56
+ "ingest/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
57
+ "search/semantic.py": "974faa592af9a67ec50a691180ad68d90e00d38244871680c0c45f31a77f8f36",
58
+ "search/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
59
+ "tests/test_core.py": "58e5d87c0de8482328abcc27d7a1452cdc6a69740eb0de4395c78a250d12d79e",
60
+ "tests/test_schema.py": "04c0343db5c7516679395717a1dd4c2eca4e325cf038e5c6ee794c2a62649119",
61
+ "tests/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
62
+ }
HF_JUSTIFICATION.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ This Hugging Face Space provides a public-interest federated search interface
3
+ across U.S. Government FOIA Electronic Reading Rooms.
4
+
5
+ Safeguards:
6
+ - Public sources only
7
+ - No authentication bypass
8
+ - Rate limiting and health checks
9
+ - Redaction-aware previews
10
+ - Metadata indexing only
11
+
12
+ Intended for journalism, research, and accountability.
HF_SPACE_README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FOIA Federated Document Search (Public Interest)
2
+
3
+ 🚀 **Hugging Face Space – Transparency & Accountability Tool**
4
+
5
+ This application provides **semantic search across publicly released U.S. Government FOIA electronic reading rooms**.
6
+ It does **not** access classified, private, or restricted systems.
7
+
8
+ ## What This Is
9
+ - Federated FOIA document search
10
+ - Semantic + keyword hybrid retrieval
11
+ - Redaction-aware exports
12
+ - Audit logging
13
+
14
+ ## What This Is NOT
15
+ - Surveillance
16
+ - Intelligence gathering
17
+ - Law enforcement tooling
18
+ - Political persuasion
19
+
20
+ ## Data Sources
21
+ - CIA FOIA Electronic Reading Room
22
+ - FBI Vault
23
+ - Other agency FOIA libraries (public releases only)
24
+
25
+ ## Compliance
26
+ - FOIA-only sources
27
+ - robots.txt respected
28
+ - Rate-limited adapters
29
+ - Redaction before export
30
+
31
+ ## Intended Users
32
+ Researchers, journalists, historians, and the general public.
LEGAL_MEMO.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ FOIA Federated Search – Legal Summary
3
+
4
+ This system indexes publicly released FOIA records.
5
+ No restricted access, no scraping of protected systems.
6
+ Fully compliant with 5 U.S.C. § 552.
README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FOIA Federated Search
3
+ emoji: 📜
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: "4.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # FOIA Federated Search (Public Interest)
14
+
15
+ A Hugging Face Space that provides **live federated search** across publicly available
16
+ U.S. Government FOIA Electronic Reading Rooms (CIA, FBI, DoD, and more).
17
+
18
+ ## Key Features
19
+ - Live async fan-out search (no scraping beyond public endpoints)
20
+ - Per-agency source toggles + result counts
21
+ - Semantic *search-in-results* using FAISS + sentence-transformers
22
+ - Local caching + deduplication
23
+ - PDF export of search results
24
+ - Inline document preview (where permitted by source)
25
+ - Rate-limited, health-checked agency adapters
26
+
27
+ ## Trust & Safety
28
+ - Queries only **public FOIA reading rooms**
29
+ - Honors robots.txt, rate limits, and agency terms
30
+ - No authentication bypass or restricted content
31
+ - Designed for research, journalism, and public accountability
32
+
33
+ ## Legal
34
+ All content remains hosted by the originating agency.
35
+ This tool indexes metadata and snippets for discovery only.
README_PROD.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FOIA HF Document Search — Production Build
2
+
3
+ ## Entry Point
4
+ - `app.py` — orchestrates ingestion, semantic search, export, and audit hooks.
5
+
6
+ ## Ingestion Adapters (Present)
7
+ - CIA Reading Room
8
+ - FBI Vault
9
+ - Generic Public FOIA
10
+
11
+ ## Missing / Stubbed Adapters (Recommended)
12
+ - DoD (incl. components)
13
+ - NSA
14
+ - DIA
15
+ - DHS
16
+ - DEA
17
+ - ICE
18
+
19
+ ## Vector Backend Assumptions
20
+ - Current code supports abstract vector ops.
21
+ - Recommended backends:
22
+ - FAISS (local)
23
+ - Chroma (persistent)
24
+ - HuggingFace embeddings
25
+ - OpenAI embeddings (optional)
26
+
27
+ ## Live Federated Search Upgrade
28
+ - Async querying via `asyncio` + `httpx`
29
+ - Adapter interface with rate limits
30
+ - Response caching + deduplication
31
+ - Circuit breakers for abuse prevention
32
+
33
+ ## Compliance
34
+ - Respect robots.txt where applicable
35
+ - Rate limiting per agency
36
+ - Redaction before export
37
+ - Audit logging enabled
38
+
39
+ ## Build Timestamp
40
+ 2026-01-09T23:51:16.728748Z
SOURCES.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FOIA Public Sources
2
+
3
+ All sources listed here are **public FOIA electronic reading rooms** or official public-release libraries.
4
+
5
+ ## Intelligence & Defense
6
+ - CIA FOIA Electronic Reading Room — https://www.cia.gov/readingroom/
7
+ - FBI Vault — https://vault.fbi.gov/
8
+ - DARPA FOIA Library — https://www.darpa.mil/work-with-us/foia
9
+ - NRO FOIA Reading Room — https://www.nro.gov/FOIA/
10
+ - DoD FOIA Reading Room — https://www.esd.whs.mil/FOIA/Reading-Room/
11
+
12
+ ## Military Branches
13
+ - U.S. Army FOIA — https://www.army.mil/foia
14
+ - U.S. Navy FOIA — https://www.secnav.navy.mil/foia
15
+ - U.S. Air Force FOIA — https://www.af.mil/FOIA/
16
+ - U.S. Marine Corps FOIA — https://www.hqmc.marines.mil/Agencies/FOIA/
17
+ - U.S. Space Force FOIA — https://www.spaceforce.mil/FOIA/
18
+ - U.S. Coast Guard FOIA — https://www.uscg.mil/FOIA/
19
+
20
+ ## Other Agencies
21
+ - DHS FOIA Library — https://www.dhs.gov/foia-library
22
+ - DEA FOIA Reading Room — https://www.dea.gov/foia
23
+ - Secret Service FOIA — https://www.secretservice.gov/foia
__init__.py ADDED
File without changes
aatip_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class AATIPAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for AATIP materials.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    name = "AATIP"           # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
aatip_sample.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ AATIP referenced ████ by DoD components between 2009 and 2017.
agency_registry.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ingest/agency_registry.py

# Domains are PUBLIC FOIA / reading room hosts only.
# Labels may include units if (and only if) a public FOIA page exists
# and the user provides its URL.

# Allow-list mapping a hostname (or a "label:" pseudo-key) to a short
# human-readable agency label. Keys starting with "label:" are NOT hosts
# and must never be fetched; they exist only so a user-supplied public
# FOIA URL can be tagged with an organizational label.
ALLOWED_FOIA_SOURCES = {
    # Core
    "vault.fbi.gov": "FBI",
    "www.cia.gov": "CIA",
    "www.archives.gov": "NARA",
    "foia.state.gov": "State Dept",
    "www.nsa.gov": "NSA",
    "www.defense.gov": "DoD",
    "www.esd.whs.mil": "DoD FOIA",
    "www.whitehouse.gov": "White House",

    # Military (public FOIA pages)
    "www.af.mil": "USAF",
    "www.navy.mil": "US Navy",
    "www.marines.mil": "USMC",
    "www.army.mil": "US Army",
    "www.spaceforce.mil": "US Space Force",

    # Intelligence / defense components (public FOIA pages only)
    "www.dia.mil": "DIA",
    "www.nro.gov": "NRO",

    # Law enforcement / protective services (public FOIA pages)
    "www.secretservice.gov": "US Secret Service",
    "www.dea.gov": "DEA",

    # Labels for historical / organizational references
    # (NO claim of dedicated public repositories)
    # These are ONLY labels applied if a public FOIA URL is supplied.
    "label:SAC": "CIA Special Activities Center (label only)",
    "label:SAD": "CIA Special Activities Division (label only)",
    "label:NIA": "National Intelligence Authority (historical)"
}
air_force_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USAirForceAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the U.S. Air Force.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    name = "USAirForce"      # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
analysis.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
def build_timeline(docs: List[dict]) -> Dict[str, int]:
    """Count documents per four-digit year taken from each doc's "date" prefix.

    Documents whose "date" field does not begin with four digits (or is
    missing entirely) are skipped.
    """
    year_prefixes = (doc.get("date", "")[:4] for doc in docs)
    timeline: Dict[str, int] = {}
    for year in year_prefixes:
        if year.isdigit():
            timeline[year] = timeline.get(year, 0) + 1
    return timeline
analytics.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import time
from collections import Counter

# Process-local event tally; cleared only when the process restarts.
_events = Counter()

def track(event: str):
    """Record a single occurrence of *event*."""
    _events[event] += 1

def snapshot():
    """Return the current event tallies together with a Unix timestamp."""
    return {
        "timestamp": int(time.time()),
        "events": dict(_events),
    }
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import asyncio
from ingest.cia_reading_room import CIAAdapter
from ingest.fbi_vault_live import FBIAdapter
from ingest.dod_reading_room_live import DoDAdapter
from core.async_search import fanout_search
from core.cache import dedupe
from core.cluster import cluster_results
from core.citations import citation_block
from core.redaction import redaction_confidence
from core.journalist import journalist_export
from core.explain import explain

# Module-level adapter singletons shared by every request.
cia, fbi, dod = CIAAdapter(), FBIAdapter(), DoDAdapter()

async def run(q):
    """Fan query *q* out to all three adapters and de-duplicate the results."""
    res = await fanout_search([cia,fbi,dod], q)
    return dedupe(res)

with gr.Blocks() as demo:
    gr.Markdown("# FOIA Federated Search — Supreme")

    q = gr.Textbox(label="Query")
    results_state = gr.State([])  # latest raw result list, kept for export

    with gr.Tabs():
        with gr.Tab("Clustered Results"):
            clusters = gr.JSON()
        with gr.Tab("Citations"):
            cites = gr.Markdown()
        with gr.Tab("Explainability"):
            explain_box = gr.JSON()

    preview = gr.JSON(label="Redaction Confidence")

    def _run(q):
        # Synchronous bridge for the Gradio callback; asyncio.run creates
        # a fresh event loop per click.
        res = asyncio.run(run(q))
        cl = cluster_results(res)
        # Citations are rendered only for the first five results.
        cites_md = "\n".join(citation_block(r) for r in res[:5])
        explain_data = explain(res)
        red = {r.get("url"): redaction_confidence(r) for r in res}
        return res, cl, cites_md, explain_data, red

    btn = gr.Button("Search")
    btn.click(_run, inputs=q, outputs=[results_state, clusters, cites, explain_box, preview])

    exp = gr.Button("Journalist Export")
    out = gr.File()
    exp.click(lambda r: journalist_export(r, "/tmp/journalist_export.zip"), inputs=results_state, outputs=out)

demo.launch()
appeal_pdf.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from reportlab.platypus import SimpleDocTemplate, Paragraph
2
+ from reportlab.lib.styles import getSampleStyleSheet
3
+
4
def generate_appeal_pdf(text, filename="appeal.pdf"):
    """Render *text* as a single-paragraph PDF at *filename*; return the path."""
    styles = getSampleStyleSheet()
    body = Paragraph(text, styles["BodyText"])
    document = SimpleDocTemplate(filename)
    document.build([body])
    return filename
appeals.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Return the plain-text body of a FOIA reconsideration appeal letter.

    *agency*, *document*, and *reason* are interpolated verbatim into a
    fixed template; surrounding whitespace is stripped from the result.
    """
    return f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
""".strip()
army_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USArmyAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the U.S. Army.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    name = "USArmy"          # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
async_search.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge the successful result lists.

    Adapter failures are tolerated: gather() captures exceptions, and any
    outcome that is not a list (i.e. an exception) is silently dropped.
    """
    outcomes = await asyncio.gather(
        *(adapter.search(query) for adapter in adapters),
        return_exceptions=True,
    )
    merged = []
    for outcome in outcomes:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
audit.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import uuid
from datetime import datetime, timezone
from typing import Dict, List

# In-memory, process-local audit trail (not persisted).
_AUDIT_LOG: List[Dict] = []

def log_event(action: str, payload: Dict) -> Dict:
    """Append an audit entry for *action* with *payload* and return it.

    The timestamp is UTC in ISO-8601 form with a trailing "Z". Uses a
    timezone-aware datetime — datetime.utcnow() is deprecated since
    Python 3.12 — while keeping the original "...Z" output format.
    """
    entry = {
        "id": str(uuid.uuid4()),
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "action": action,
        "payload": payload
    }
    _AUDIT_LOG.append(entry)
    return entry

def export_audit_log() -> List[Dict]:
    """Return a shallow copy of the audit log so callers cannot mutate it."""
    return list(_AUDIT_LOG)
cache.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from typing import Dict, Any, List

_TTL = 300  # cache entry lifetime, seconds
_cache: Dict[str, Any] = {}
_faiss = None  # lazily-created FaissIndex; None until first use

def _now():
    """Current Unix time as an int."""
    return int(time.time())

def _get_index():
    """Return the shared FaissIndex, creating it on first call.

    The FaissIndex import is deferred to here so this module can be
    imported even when the FAISS / sentence-transformers stack is not
    installed; cache_set() already treats indexing as best-effort.
    """
    global _faiss
    if _faiss is None:
        from core.faiss_vector import FaissIndex
        _faiss = FaissIndex()
    return _faiss

def cache_get(key):
    """Return cached data for *key*, or None if absent or older than _TTL."""
    entry = _cache.get(key)
    if not entry:
        return None
    ts, data = entry
    if _now() - ts > _TTL:
        _cache.pop(key, None)  # expire eagerly on read
        return None
    return data

def cache_set(key, data: List[dict]):
    """Store *data* under *key* and index its snippets for semantic recall."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    texts = [d.get("snippet", "") for d in data if d.get("snippet")]
    if texts:
        try:
            _get_index().add(texts)
        except Exception:
            pass  # best-effort: semantic indexing must never break caching

def dedupe(results: List[dict]) -> List[dict]:
    """Drop duplicate results, keyed on (source, url, snippet).

    Fix: the previous version compared hash() values, which can collide
    and silently drop distinct results; the key tuples themselves are
    stored now, which is collision-free.
    """
    seen = set()
    unique = []
    for r in results:
        key = (r.get("source"), r.get("url"), r.get("snippet"))
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique

def source_counts(results: List[dict]) -> Dict[str, int]:
    """Count results per "source" label ("Unknown" when missing)."""
    counts: Dict[str, int] = {}
    for r in results:
        s = r.get("source", "Unknown")
        counts[s] = counts.get(s, 0) + 1
    return counts
cia.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .common import fetch, clean
2
+ from bs4 import BeautifulSoup
3
+
4
def search_cia(query):
    """Search the CIA FOIA Electronic Reading Room; return result dicts."""
    base = "https://www.cia.gov/readingroom/search/site/"
    soup = BeautifulSoup(fetch(base, {"search_api_fulltext": query}), "html.parser")

    hits = []
    for row in soup.select(".views-row"):
        link = row.select_one("a")
        if link is None:
            continue
        hits.append({
            "title": clean(link.text),
            "agency": "CIA",
            "date": None,      # result listing does not expose a date
            "snippet": None,   # snippets not parsed from the listing
            "url": "https://www.cia.gov" + link["href"],
        })
    return hits
cia_reading_room.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from ingest.generic_public_foia import GenericFOIAAdapter
3
+
4
class CIAAdapter(GenericFOIAAdapter):
    """Live adapter for the CIA FOIA Electronic Reading Room (public site)."""
    name = "CIA"
    rate_limit = 1           # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
    base_url = "https://www.cia.gov/readingroom/search/site/"

    async def search(self, query: str):
        """Hit the public search endpoint and return one page-level result.

        Returns [] on any non-200 response. No HTML parsing is performed;
        only the resolved search-results URL is surfaced as a pointer.
        """
        async with httpx.AsyncClient(timeout=10) as client:
            r = await client.get(self.base_url, params={"query": query})
            if r.status_code != 200:
                return []
            # Minimal safe parse: return page-level hit
            return [{
                "source": "CIA FOIA Reading Room",
                "query": query,
                "url": str(r.url),
                "snippet": "Public FOIA search result page"
            }]
citations.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def citation_block(result: dict) -> str:
    """Format one search result as a fenced, newline-separated citation block."""
    fields = [
        "---",
        f"Source: {result.get('source')}",
        f"Title: {result.get('title', 'Unknown')}",
        f"URL: {result.get('url')}",
        f"Retrieved: {result.get('retrieved_at', 'N/A')}",
        "---",
    ]
    return "\n".join(fields)
cluster.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ from core.faiss_vector import FaissIndex
3
+
4
def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group search results by their "source" field ("Unknown" if missing).

    *k* is accepted for interface compatibility but is currently unused:
    grouping is per-source, not k-means. Fix: the previous version built
    a FaissIndex from the snippets and then never used it — pure wasted
    embedding work that also crashed whenever FAISS was not installed;
    that dead code has been removed.
    """
    clusters: Dict[str, List[dict]] = {}
    for result in results:
        clusters.setdefault(result.get("source", "Unknown"), []).append(result)
    return clusters
coast_guard_foia_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class USCoastGuardAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the U.S. Coast Guard.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    name = "USCoastGuard"    # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
collaboration.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from datasets import Dataset
from typing import Dict, List

# In-memory, process-local store of collaboration notes.
_COLLAB: List[Dict] = []

def add_collaboration_note(document: str, note: str) -> Dict:
    """Attach *note* to *document* and return the stored record."""
    record = {
        "document": document,
        "note": note
    }
    _COLLAB.append(record)
    return record

def get_collaboration_dataset() -> Dataset:
    """Return all notes as a Hugging Face Dataset.

    An empty Dataset with the same two-column schema is returned when no
    notes have been recorded (Dataset.from_list([]) would lose the schema).
    """
    if not _COLLAB:
        return Dataset.from_dict({"document": [], "note": []})
    return Dataset.from_list(_COLLAB)
common.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
+ HEADERS = {
5
+ "User-Agent": "FOIA-Federated-Search/1.0 (public, non-crawling)"
6
+ }
7
+
8
def fetch(url, params=None):
    """GET *url* with the project User-Agent; return the body text.

    Raises requests.HTTPError on non-2xx responses; 10-second timeout.
    """
    response = requests.get(url, params=params, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response.text
12
+
13
def clean(text):
    """Collapse whitespace runs in *text* to single spaces; '' for falsy input."""
    if not text:
        return ""
    return " ".join(text.split())
darpa_reading_room.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DARPAAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for DARPA.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    name = "DARPA"           # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
dea.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def search_dea(query):
    """Return a single pointer result to the DEA FOIA reading room.

    No live search is performed; *query* is echoed in the snippet so the
    caller can see what it was searching for.
    """
    pointer = {
        "title": "DEA FOIA Reading Room",
        "agency": "DEA",
        "date": None,
        "snippet": query,
        "url": "https://www.dea.gov/foia",
    }
    return [pointer]
dea_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DEAAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the DEA."""
    name = "DEA"             # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
dhs.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def search_dhs(query):
    """Return a single pointer result to the DHS FOIA library.

    No live search is performed; *query* is echoed in the snippet.
    Fix: the title literal carried a stray f-string prefix with no
    placeholders (ruff F541); it is now a plain string — output unchanged.
    """
    return [{
        "title": "DHS FOIA Search",
        "agency": "DHS",
        "date": None,
        "snippet": query,
        "url": "https://www.dhs.gov/foia"
    }]
dhs_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DHSAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for DHS."""
    name = "DHS"             # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
dia.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def search_dia(query):
    """Return a single pointer result to the DIA FOIA reading room.

    No live search is performed; *query* is echoed in the snippet.
    """
    pointer = {
        "title": "DIA FOIA Reading Room",
        "agency": "DIA",
        "date": None,
        "snippet": query,
        "url": "https://www.dia.mil/FOIA/",
    }
    return [pointer]
dia_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DIAAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the DIA."""
    name = "DIA"             # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
dod.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def search_dod(query):
    """Return a single redirect-style pointer to the DoD FOIA reading room."""
    pointer = {
        "title": f"DoD FOIA Search: {query}",
        "agency": "DoD",
        "date": None,
        "snippet": "Redirect to DoD FOIA Reading Room search",
        "url": "https://open.defense.gov/Transparency/FOIA/",
    }
    return [pointer]
dod_reading_room.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from ingest.generic_public_foia import GenericFOIAAdapter
2
+
3
class DoDAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter for the DoD (stub variant)."""
    name = "DoD"             # display label used in result "source" fields
    rate_limit = 1  # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
dod_reading_room_live.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ from ingest.generic_public_foia import GenericFOIAAdapter
3
+
4
class DoDAdapter(GenericFOIAAdapter):
    """Live adapter for the DoD FOIA Reading Room (public site)."""
    name = "DoD FOIA Reading Room"
    rate_limit = 1           # requests per second
    robots_respected = True  # adapter honors the target site's robots.txt
    base_url = "https://www.esd.whs.mil/FOIA/Reading-Room/"

    async def search(self, query: str):
        """Fetch the public reading-room page and return one page-level result.

        Returns [] on any non-200 response. No HTML parsing is performed;
        only the resolved page URL is surfaced as a pointer result.
        """
        async with httpx.AsyncClient(timeout=10) as client:
            r = await client.get(self.base_url, params={"search": query})
            if r.status_code != 200:
                return []
            return [{
                "source": "DoD FOIA Reading Room",
                "query": query,
                "url": str(r.url),
                "snippet": "Public DoD FOIA reading room page"
            }]
doj.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def search_doj(query):
    """Return a single pointer result to the DOJ FOIA reading room.

    No live search is performed; *query* is echoed in the snippet.
    Fix: the title literal carried a stray f-string prefix with no
    placeholders (ruff F541); it is now a plain string — output unchanged.
    """
    return [{
        "title": "DOJ FOIA Reading Room",
        "agency": "DOJ",
        "date": None,
        "snippet": query,
        "url": "https://www.justice.gov/oip/foia-reading-room"
    }]
entity_graph.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import networkx as nx
2
+ from typing import List, Dict
3
+
4
def build_entity_graph(docs: List[Dict]) -> Dict:
    """Build an agency-to-entity graph from document contents.

    Every all-uppercase token longer than two characters in a doc's
    "content" is treated as an entity node and linked to the doc's
    agency node. Returns {"nodes": [...], "links": [...]} suitable for
    force-directed rendering.
    """
    graph = nx.Graph()

    for doc in docs:
        agency = doc.get("agency", "Unknown")
        graph.add_node(agency, group="agency")
        for word in doc.get("content", "").split():
            if word.isupper() and len(word) > 2:
                graph.add_node(word, group="entity")
                graph.add_edge(agency, word)

    nodes = [{"id": n, "group": graph.nodes[n]["group"]} for n in graph.nodes]
    links = [{"source": a, "target": b} for a, b in graph.edges]
    return {"nodes": nodes, "links": links}
explain.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def explain(results):
    """Summarize provenance of a result set: counts, sources, and methods."""
    distinct_sources = {r.get("source") for r in results}
    return {
        "total_results": len(results),
        "sources": list(distinct_sources),
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)",
        ],
        "no_restricted_access": True,
    }
export_utils.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def export_json(data):
    """Serialize *data* as pretty-printed JSON to /tmp/results.json; return the path."""
    target = "/tmp/results.json"
    payload = json.dumps(data, indent=2)
    with open(target, "w") as handle:
        handle.write(payload)
    return target
faiss_vector.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    # Either dependency missing -> semantic features disabled; the
    # constructor raises a clear RuntimeError instead of a NameError.
    faiss = None

class FaissIndex:
    """Small in-memory semantic index over text snippets.

    Texts are embedded with a sentence-transformers model and stored in
    a flat L2 FAISS index alongside the raw strings.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add()
        self.docs = []     # texts in insertion order, aligned with FAISS ids

    def add(self, texts):
        """Embed and index *texts*. Fix: empty input is now a no-op
        (encoding [] yields an array with no second dimension and crashed)."""
        if not texts:
            return
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query*.

        Fixes: (1) searching before any add() returned an AttributeError
        on the None index — now returns []. (2) FAISS pads missing
        neighbors with id -1 when k exceeds the index size; the old
        `i < len(self.docs)` filter let -1 through and wrongly returned
        the LAST document — ids are now required to be non-negative.
        """
        if self.index is None:
            return []
        emb = self.model.encode([query])
        _, ids = self.index.search(emb, k)
        return [self.docs[i] for i in ids[0] if 0 <= i < len(self.docs)]
fbi.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .common import fetch, clean
2
+ from bs4 import BeautifulSoup
3
+
4
def search_fbi(query):
    """Search the FBI Vault and return result dicts for /vault/ links."""
    page = fetch("https://vault.fbi.gov/search", {"SearchableText": query})
    soup = BeautifulSoup(page, "html.parser")

    hits = []
    for anchor in soup.select("a"):
        href = anchor.get("href", "")
        if "/vault/" not in href:
            continue
        # Relative vault links are resolved against the site root.
        full_url = href if href.startswith("http") else "https://vault.fbi.gov" + href
        hits.append({
            "title": clean(anchor.text),
            "agency": "FBI",
            "date": None,
            "snippet": None,
            "url": full_url,
        })
    return hits
fbi_vault.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
def ingest_fbi_vault(url: str) -> dict:
    """Fetch one public FBI Vault page and return a normalized document record.

    Raises requests.HTTPError on non-2xx responses.

    Fix: the previous body was corrupted by a botched merge — the return
    dict was immediately "called" with `}(r.text, "html.parser")` and a
    duplicated second half of the function followed, making the module a
    SyntaxError. The first, complete implementation (h1 title fallback
    "FBI Vault Document", text capped at 10,000 chars) is kept.
    """
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    text = soup.get_text(separator=" ", strip=True)
    title = soup.find("h1")

    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        "title": title.text if title else "FBI Vault Document",
        "text": text[:10000]  # cap stored text to keep records small
    }