Spaces:
Sleeping
Sleeping
Upload 98 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- AGENCY_COVERAGE.md +11 -0
- CODE_OF_CONDUCT.md +28 -0
- Dockerfile.hf +5 -0
- ETHICS.md +24 -0
- FILE_INVENTORY.txt +60 -0
- HASH_MANIFEST.json +62 -0
- HF_JUSTIFICATION.md +12 -0
- HF_SPACE_README.md +32 -0
- LEGAL_MEMO.md +6 -0
- README.md +35 -0
- README_PROD.md +40 -0
- SOURCES.md +23 -0
- __init__.py +0 -0
- aatip_reading_room.py +9 -0
- aatip_sample.txt +1 -0
- agency_registry.py +39 -0
- air_force_foia_reading_room.py +9 -0
- analysis.py +12 -0
- analytics.py +14 -0
- app.py +51 -0
- appeal_pdf.py +8 -0
- appeals.py +16 -0
- army_foia_reading_room.py +9 -0
- async_search.py +10 -0
- audit.py +18 -0
- cache.py +53 -0
- cia.py +21 -0
- cia_reading_room.py +21 -0
- citations.py +7 -0
- cluster.py +12 -0
- coast_guard_foia_reading_room.py +9 -0
- collaboration.py +17 -0
- common.py +14 -0
- darpa_reading_room.py +9 -0
- dea.py +8 -0
- dea_reading_room.py +6 -0
- dhs.py +8 -0
- dhs_reading_room.py +6 -0
- dia.py +8 -0
- dia_reading_room.py +6 -0
- dod.py +8 -0
- dod_reading_room.py +6 -0
- dod_reading_room_live.py +20 -0
- doj.py +8 -0
- entity_graph.py +19 -0
- explain.py +12 -0
- export_utils.py +7 -0
- faiss_vector.py +25 -0
- fbi.py +19 -0
- fbi_vault.py +29 -0
AGENCY_COVERAGE.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Agency Coverage Map
|
| 3 |
+
|
| 4 |
+
| Agency | Public FOIA Reading Room |
|
| 5 |
+
|------|---------------------------|
|
| 6 |
+
| CIA | https://www.cia.gov/readingroom/ |
|
| 7 |
+
| FBI | https://vault.fbi.gov/ |
|
| 8 |
+
| DoD | https://www.foia.mil/ |
|
| 9 |
+
| NSA | https://www.nsa.gov/readingroom/ |
|
| 10 |
+
| NRO | https://www.nro.gov/FOIA/ |
|
| 11 |
+
| DHS | https://www.dhs.gov/foia-reading-room |
|
CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code of Conduct
|
| 2 |
+
|
| 3 |
+
## Our Pledge
|
| 4 |
+
|
| 5 |
+
This project is committed to providing a respectful, inclusive, and responsible environment for all contributors and users.
|
| 6 |
+
|
| 7 |
+
## Acceptable Use
|
| 8 |
+
|
| 9 |
+
Participants agree to:
|
| 10 |
+
- Use this project for lawful, ethical, and non-harmful purposes
|
| 11 |
+
- Respect the public-record nature of FOIA documents
|
| 12 |
+
- Avoid speculative, defamatory, or misleading interpretations
|
| 13 |
+
|
| 14 |
+
## Unacceptable Use
|
| 15 |
+
|
| 16 |
+
This project must not be used to:
|
| 17 |
+
- Harass or target individuals
|
| 18 |
+
- Make unsubstantiated allegations
|
| 19 |
+
- Claim access to classified or restricted information
|
| 20 |
+
- Bypass legal or ethical safeguards
|
| 21 |
+
|
| 22 |
+
## Enforcement
|
| 23 |
+
|
| 24 |
+
Maintainers may remove content or restrict access that violates this Code of Conduct.
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
This project is intended for civic transparency, education, and research.
|
Dockerfile.hf
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
COPY . /app
|
| 4 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 5 |
+
CMD ["python", "app.py"]
|
ETHICS.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ethics Policy
|
| 2 |
+
|
| 3 |
+
## Purpose
|
| 4 |
+
|
| 5 |
+
This project exists to support transparency, research, and public understanding of government records released under the Freedom of Information Act (FOIA).
|
| 6 |
+
|
| 7 |
+
## Guiding Principles
|
| 8 |
+
|
| 9 |
+
- **Public Sources Only:** All data must originate from publicly released documents.
|
| 10 |
+
- **No Speculation:** The project does not infer, predict, or hypothesize beyond document text.
|
| 11 |
+
- **Citation First:** Outputs must be traceable to source material.
|
| 12 |
+
- **No Harm:** The tool must not be used to defame, harass, or mislead.
|
| 13 |
+
|
| 14 |
+
## Redactions
|
| 15 |
+
|
| 16 |
+
Redacted content is respected. This project does not attempt to reconstruct or infer withheld information.
|
| 17 |
+
|
| 18 |
+
## Accountability
|
| 19 |
+
|
| 20 |
+
Users are responsible for how they interpret and use results. This tool provides analytical assistance, not conclusions.
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
Ethical transparency is foundational to this project.
|
FILE_INVENTORY.txt
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CODE_OF_CONDUCT.md
|
| 2 |
+
ETHICS.md
|
| 3 |
+
README.md
|
| 4 |
+
README_PROD.md
|
| 5 |
+
__init__.py
|
| 6 |
+
adapters/__init__.py
|
| 7 |
+
adapters/cia.py
|
| 8 |
+
adapters/common.py
|
| 9 |
+
adapters/dea.py
|
| 10 |
+
adapters/dhs.py
|
| 11 |
+
adapters/dia.py
|
| 12 |
+
adapters/dod.py
|
| 13 |
+
adapters/doj.py
|
| 14 |
+
adapters/fbi.py
|
| 15 |
+
adapters/ice.py
|
| 16 |
+
adapters/nia.py
|
| 17 |
+
adapters/nsa.py
|
| 18 |
+
app.py
|
| 19 |
+
appeal_pdf.py
|
| 20 |
+
appeals/__init__.py
|
| 21 |
+
appeals/pdf_appeal.py
|
| 22 |
+
audit.py
|
| 23 |
+
collaboration.py
|
| 24 |
+
collaboration/__init__.py
|
| 25 |
+
collaboration/icij.py
|
| 26 |
+
core/__init__.py
|
| 27 |
+
core/analysis.py
|
| 28 |
+
core/appeals.py
|
| 29 |
+
core/explain.py
|
| 30 |
+
core/index.py
|
| 31 |
+
core/multi_program.py
|
| 32 |
+
core/redaction.py
|
| 33 |
+
core/search.py
|
| 34 |
+
core/vector.py
|
| 35 |
+
data/demo/documents/aatip_sample.txt
|
| 36 |
+
data/demo/documents/tencap_sample.txt
|
| 37 |
+
data/demo/metadata.json
|
| 38 |
+
data/foia_sources.json
|
| 39 |
+
entity_graph.py
|
| 40 |
+
export_utils.py
|
| 41 |
+
file_structure.txt
|
| 42 |
+
foia_pdf.py
|
| 43 |
+
foia_requests.py
|
| 44 |
+
gitattributes.txt
|
| 45 |
+
ingest/__init__.py
|
| 46 |
+
ingest/agency_registry.py
|
| 47 |
+
ingest/cia_reading_room.py
|
| 48 |
+
ingest/fbi_vault.py
|
| 49 |
+
ingest/generic_public_foia.py
|
| 50 |
+
ingest/loader.py
|
| 51 |
+
ingest/sources.py
|
| 52 |
+
requirements.txt
|
| 53 |
+
schemas.py
|
| 54 |
+
search/__init__.py
|
| 55 |
+
search/semantic.py
|
| 56 |
+
semantic.py
|
| 57 |
+
tests/__init__.py
|
| 58 |
+
tests/test_core.py
|
| 59 |
+
tests/test_schema.py
|
| 60 |
+
vector_store.py
|
HASH_MANIFEST.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"CODE_OF_CONDUCT.md": "b674f96cae26f0050be863c4b8782510fcae5ab855f0822ec4a0217763a84601",
|
| 3 |
+
"ETHICS.md": "d4f7c23c1e60297712786e392800158fcbe21116576496632e8221b0b8a16ff2",
|
| 4 |
+
"README.md": "e9bfdd2d6a4422fcb132bd4033a69d2241574c31fab71820e4643491b3b1225b",
|
| 5 |
+
"app.py": "c2a2b16ce45a327de0d42196104cb7fc50ec29ff1cb1fb95517a8ca655a3192a",
|
| 6 |
+
"appeal_pdf.py": "2d28ca1d0e796bfb5da25eac05a91354aadd58deefd041acded9a01a64055f9c",
|
| 7 |
+
"audit.py": "01c286d4067c6fffcb990391d8f750719c1ccac07eafc4477ccbdd1be4dd11e8",
|
| 8 |
+
"collaboration.py": "7cbd52c0da9be9f205b2901d8a94f28cb96612ffe506bcad1c7991885cd2d947",
|
| 9 |
+
"entity_graph.py": "dbe21fa0d8e7528daeee34d598efba836ab6370ad609de80746be1b12a4e0ff5",
|
| 10 |
+
"export_utils.py": "a01a088fd650a947a7831e795508208d3caa430d099aa5a8d7823ba462f0a80e",
|
| 11 |
+
"file_structure.txt": "6eee55e586751e3ae1405349f01dd35703e678d8e105ea19fc58eb15e4c2a6fa",
|
| 12 |
+
"foia_pdf.py": "babbd69a2da67681f15596ab254174310b8381d5853da72fe068d31d746725ab",
|
| 13 |
+
"foia_requests.py": "ca9c765bb7a591c462a94b0aa42957d1b3124128266d4880f0654895ce0ca6c0",
|
| 14 |
+
"gitattributes.txt": "11ad7efa24975ee4b0c3c3a38ed18737f0658a5f75a0a96787b576a78a023361",
|
| 15 |
+
"requirements.txt": "444bc9beedfa3fde82790f47c1e9b94bab90be2fefd0648de0ffdebbcc2eb61c",
|
| 16 |
+
"schemas.py": "e08b38513be2572af7d022e013f037c4f614f2117db85d4d776c408be96815ef",
|
| 17 |
+
"semantic.py": "4ffcf9149f08b8e69473e5418588dd370bbd470b137f2d0761901fccf09238cf",
|
| 18 |
+
"vector_store.py": "c61701e38e12150c541d284e13824341dde1794d3b4149d2a7d332b8023ad923",
|
| 19 |
+
"__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 20 |
+
"README_PROD.md": "3b5d0a9f882f8f980a08452ca589a788b3c7cfe2ed8b7ca13a01f9c4a12e9060",
|
| 21 |
+
"adapters/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 22 |
+
"adapters/cia.py": "a934e6a67aa6662391814036a9084779f15ad9ca5059f5461e2c374dfa9c3344",
|
| 23 |
+
"adapters/common.py": "c76c7ea1ce1616a2c99bfeec47ca046e75088f4d807f9c94ce5f87c9eeed5714",
|
| 24 |
+
"adapters/dea.py": "f1b9832aeaabecf5da8f1125e33883ce28e37d081149f6c75bf9ef49ac3ead8a",
|
| 25 |
+
"adapters/dhs.py": "66c44ee323135ee8e3c0cb7a2bb83d2d9dc20f7b88b6db4823e1bd5d03be6227",
|
| 26 |
+
"adapters/dia.py": "5c003321750582f502bcf0e2115956edb9af3aa8937917b72d7b25036b493f6d",
|
| 27 |
+
"adapters/dod.py": "410726bcab164fa9991d0ba61b3d9586d271ee4d55f65d1bd02193e84f02ed30",
|
| 28 |
+
"adapters/doj.py": "56080addcaef0a01d2395b6d44a93e9e271bc569a688f65657617d730a054eac",
|
| 29 |
+
"adapters/fbi.py": "b81b80972adf70b8283f2c16b241d17f46ab3ab73cd3ab4155dc88f7afbbcfc2",
|
| 30 |
+
"adapters/ice.py": "f0d06239d483933ba53966bc8015b9ca9f3ead3ebb535f4f963f5a26afd340b3",
|
| 31 |
+
"adapters/nia.py": "cbc240d23d7ac144d0ca0a49e83341df579903092c13c7603cfe438e7dd58a84",
|
| 32 |
+
"adapters/nsa.py": "a5a7ff4f8d3b1397bccc6095471de814aad75e2711566065f8cf7f4f43c59303",
|
| 33 |
+
"appeals/pdf_appeal.py": "cfe7ca493bf9a4280eff3d90494b2e2afc8bfed92ee99d5e175c1daf49ddadf6",
|
| 34 |
+
"appeals/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 35 |
+
"collaboration/icij.py": "bd02217afd54664762594dfcd1e8088ac3666c641acd450d3b233cf05f08a641",
|
| 36 |
+
"collaboration/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 37 |
+
"core/analysis.py": "e745cc6ad43d5193c92b5d7c417db4546ec301a96761090a319f7a477722dd99",
|
| 38 |
+
"core/appeals.py": "9ac66f34fdb2e741b6341de258291fd99db7f4a95862e39aa4cae94448726609",
|
| 39 |
+
"core/explain.py": "accdde04f5faf85b48302917f6274a12f06b9058fac5941cfa7ce9a64a6c45a3",
|
| 40 |
+
"core/index.py": "d266fc0aacbc2445b25cafbc29530e9138bb626090fb716681f300976927903c",
|
| 41 |
+
"core/multi_program.py": "444928c79f9778ebffcdb47262ba63b2eb19d2ed4d97d5632682a92e91861138",
|
| 42 |
+
"core/redaction.py": "b99bbbcb659e1f60902bca7e2bde5b0c28f371b7a6feb9daff489bb8fd96b878",
|
| 43 |
+
"core/search.py": "5843e5ee44d88688862b73e5457ff596dd229fc9433600c2e1a978868c8a2296",
|
| 44 |
+
"core/vector.py": "518e78f8c363735f5629584d2d5e25876a7f80063cd74e72a080723380141ce8",
|
| 45 |
+
"core/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 46 |
+
"data/foia_sources.json": "8fe166a285717548afb937ad7a669020c60f91d9ec9f06dbdee9954f3396bd2a",
|
| 47 |
+
"data/demo/metadata.json": "89d069dd00b20d1c74eb6f192a09b9d11226d86d5f7754159b3b1717512302d3",
|
| 48 |
+
"data/demo/documents/aatip_sample.txt": "8b8d9a6167699a123885330093dac739025bfe0d7fabfdfd596707ab53db9f81",
|
| 49 |
+
"data/demo/documents/tencap_sample.txt": "e1930579e04e76cc2ced2b5b253fa59e907b28bbffba2f8c1710693cfc84b167",
|
| 50 |
+
"ingest/agency_registry.py": "89581ae5dcf6f0e5614939ce8538e17f4e22a1751d806bfce5cd51fbf9d35f85",
|
| 51 |
+
"ingest/cia_reading_room.py": "ebfa118842937a7929a1ce58998650f11081306e8a017d53f01e11262917f2e5",
|
| 52 |
+
"ingest/fbi_vault.py": "9a24fd572db556cc182239738ca2c551d6cb6a393a325f3fc8f6db9cbf1c157b",
|
| 53 |
+
"ingest/generic_public_foia.py": "60f174b9ada68330a70ca11898ae3fbb7d225e2f404265903a5079aaa274baa1",
|
| 54 |
+
"ingest/loader.py": "12b2b68d4c3a902270be73bebb1218314f19b225f7df4e436191f433378aca18",
|
| 55 |
+
"ingest/sources.py": "4b995bff081e14cbe3b66deb516abc74fce09e29f3e36463f60bbbcaf11b075b",
|
| 56 |
+
"ingest/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 57 |
+
"search/semantic.py": "974faa592af9a67ec50a691180ad68d90e00d38244871680c0c45f31a77f8f36",
|
| 58 |
+
"search/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 59 |
+
"tests/test_core.py": "58e5d87c0de8482328abcc27d7a1452cdc6a69740eb0de4395c78a250d12d79e",
|
| 60 |
+
"tests/test_schema.py": "04c0343db5c7516679395717a1dd4c2eca4e325cf038e5c6ee794c2a62649119",
|
| 61 |
+
"tests/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
|
| 62 |
+
}
|
HF_JUSTIFICATION.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
This Hugging Face Space provides a public-interest federated search interface
|
| 3 |
+
across U.S. Government FOIA Electronic Reading Rooms.
|
| 4 |
+
|
| 5 |
+
Safeguards:
|
| 6 |
+
- Public sources only
|
| 7 |
+
- No authentication bypass
|
| 8 |
+
- Rate limiting and health checks
|
| 9 |
+
- Redaction-aware previews
|
| 10 |
+
- Metadata indexing only
|
| 11 |
+
|
| 12 |
+
Intended for journalism, research, and accountability.
|
HF_SPACE_README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA Federated Document Search (Public Interest)
|
| 2 |
+
|
| 3 |
+
🚀 **Hugging Face Space – Transparency & Accountability Tool**
|
| 4 |
+
|
| 5 |
+
This application provides **semantic search across publicly released U.S. Government FOIA electronic reading rooms**.
|
| 6 |
+
It does **not** access classified, private, or restricted systems.
|
| 7 |
+
|
| 8 |
+
## What This Is
|
| 9 |
+
- Federated FOIA document search
|
| 10 |
+
- Semantic + keyword hybrid retrieval
|
| 11 |
+
- Redaction-aware exports
|
| 12 |
+
- Audit logging
|
| 13 |
+
|
| 14 |
+
## What This Is NOT
|
| 15 |
+
- Surveillance
|
| 16 |
+
- Intelligence gathering
|
| 17 |
+
- Law enforcement tooling
|
| 18 |
+
- Political persuasion
|
| 19 |
+
|
| 20 |
+
## Data Sources
|
| 21 |
+
- CIA FOIA Electronic Reading Room
|
| 22 |
+
- FBI Vault
|
| 23 |
+
- Other agency FOIA libraries (public releases only)
|
| 24 |
+
|
| 25 |
+
## Compliance
|
| 26 |
+
- FOIA-only sources
|
| 27 |
+
- robots.txt respected
|
| 28 |
+
- Rate-limited adapters
|
| 29 |
+
- Redaction before export
|
| 30 |
+
|
| 31 |
+
## Intended Users
|
| 32 |
+
Researchers, journalists, historians, and the general public.
|
LEGAL_MEMO.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FOIA Federated Search – Legal Summary
|
| 3 |
+
|
| 4 |
+
This system indexes publicly released FOIA records.
|
| 5 |
+
No restricted access, no scraping of protected systems.
|
| 6 |
+
Fully compliant with 5 U.S.C. § 552.
|
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FOIA Federated Search
|
| 3 |
+
emoji: 📜
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "4.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# FOIA Federated Search (Public Interest)
|
| 14 |
+
|
| 15 |
+
A Hugging Face Space that provides **live federated search** across publicly available
|
| 16 |
+
U.S. Government FOIA Electronic Reading Rooms (CIA, FBI, DoD, and more).
|
| 17 |
+
|
| 18 |
+
## Key Features
|
| 19 |
+
- Live async fan-out search (no scraping beyond public endpoints)
|
| 20 |
+
- Per-agency source toggles + result counts
|
| 21 |
+
- Semantic *search-in-results* using FAISS + sentence-transformers
|
| 22 |
+
- Local caching + deduplication
|
| 23 |
+
- PDF export of search results
|
| 24 |
+
- Inline document preview (where permitted by source)
|
| 25 |
+
- Rate-limited, health-checked agency adapters
|
| 26 |
+
|
| 27 |
+
## Trust & Safety
|
| 28 |
+
- Queries only **public FOIA reading rooms**
|
| 29 |
+
- Honors robots.txt, rate limits, and agency terms
|
| 30 |
+
- No authentication bypass or restricted content
|
| 31 |
+
- Designed for research, journalism, and public accountability
|
| 32 |
+
|
| 33 |
+
## Legal
|
| 34 |
+
All content remains hosted by the originating agency.
|
| 35 |
+
This tool indexes metadata and snippets for discovery only.
|
README_PROD.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA HF Document Search — Production Build
|
| 2 |
+
|
| 3 |
+
## Entry Point
|
| 4 |
+
- `app.py` — orchestrates ingestion, semantic search, export, and audit hooks.
|
| 5 |
+
|
| 6 |
+
## Ingestion Adapters (Present)
|
| 7 |
+
- CIA Reading Room
|
| 8 |
+
- FBI Vault
|
| 9 |
+
- Generic Public FOIA
|
| 10 |
+
|
| 11 |
+
## Missing / Stubbed Adapters (Recommended)
|
| 12 |
+
- DoD (incl. components)
|
| 13 |
+
- NSA
|
| 14 |
+
- DIA
|
| 15 |
+
- DHS
|
| 16 |
+
- DEA
|
| 17 |
+
- ICE
|
| 18 |
+
|
| 19 |
+
## Vector Backend Assumptions
|
| 20 |
+
- Current code supports abstract vector ops.
|
| 21 |
+
- Recommended backends:
|
| 22 |
+
- FAISS (local)
|
| 23 |
+
- Chroma (persistent)
|
| 24 |
+
- HuggingFace embeddings
|
| 25 |
+
- OpenAI embeddings (optional)
|
| 26 |
+
|
| 27 |
+
## Live Federated Search Upgrade
|
| 28 |
+
- Async querying via `asyncio` + `httpx`
|
| 29 |
+
- Adapter interface with rate limits
|
| 30 |
+
- Response caching + deduplication
|
| 31 |
+
- Circuit breakers for abuse prevention
|
| 32 |
+
|
| 33 |
+
## Compliance
|
| 34 |
+
- Respect robots.txt where applicable
|
| 35 |
+
- Rate limiting per agency
|
| 36 |
+
- Redaction before export
|
| 37 |
+
- Audit logging enabled
|
| 38 |
+
|
| 39 |
+
## Build Timestamp
|
| 40 |
+
2026-01-09T23:51:16.728748Z
|
SOURCES.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA Public Sources
|
| 2 |
+
|
| 3 |
+
All sources listed here are **public FOIA electronic reading rooms** or official public-release libraries.
|
| 4 |
+
|
| 5 |
+
## Intelligence & Defense
|
| 6 |
+
- CIA FOIA Electronic Reading Room — https://www.cia.gov/readingroom/
|
| 7 |
+
- FBI Vault — https://vault.fbi.gov/
|
| 8 |
+
- DARPA FOIA Library — https://www.darpa.mil/work-with-us/foia
|
| 9 |
+
- NRO FOIA Reading Room — https://www.nro.gov/FOIA/
|
| 10 |
+
- DoD FOIA Reading Room — https://www.esd.whs.mil/FOIA/Reading-Room/
|
| 11 |
+
|
| 12 |
+
## Military Branches
|
| 13 |
+
- U.S. Army FOIA — https://www.army.mil/foia
|
| 14 |
+
- U.S. Navy FOIA — https://www.secnav.navy.mil/foia
|
| 15 |
+
- U.S. Air Force FOIA — https://www.af.mil/FOIA/
|
| 16 |
+
- U.S. Marine Corps FOIA — https://www.hqmc.marines.mil/Agencies/FOIA/
|
| 17 |
+
- U.S. Space Force FOIA — https://www.spaceforce.mil/FOIA/
|
| 18 |
+
- U.S. Coast Guard FOIA — https://www.uscg.mil/FOIA/
|
| 19 |
+
|
| 20 |
+
## Other Agencies
|
| 21 |
+
- DHS FOIA Library — https://www.dhs.gov/foia-library
|
| 22 |
+
- DEA FOIA Reading Room — https://www.dea.gov/foia
|
| 23 |
+
- Secret Service FOIA — https://www.secretservice.gov/foia
|
__init__.py
ADDED
|
File without changes
|
aatip_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class AATIPAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for AATIP-related public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "AATIP"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
aatip_sample.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
AATIP referenced ████ by DoD components between 2009 and 2017.
|
agency_registry.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ingest/agency_registry.py

# Whitelist of PUBLIC FOIA / reading-room hosts. A domain appears here only
# when the agency operates a public FOIA page. "label:" entries are display
# labels only — they require the user to supply a public FOIA URL themselves.
ALLOWED_FOIA_SOURCES = {
    # Core
    "vault.fbi.gov": "FBI",
    "www.cia.gov": "CIA",
    "www.archives.gov": "NARA",
    "foia.state.gov": "State Dept",
    "www.nsa.gov": "NSA",
    "www.defense.gov": "DoD",
    "www.esd.whs.mil": "DoD FOIA",
    "www.whitehouse.gov": "White House",

    # Military (public FOIA pages)
    "www.af.mil": "USAF",
    "www.navy.mil": "US Navy",
    "www.marines.mil": "USMC",
    "www.army.mil": "US Army",
    "www.spaceforce.mil": "US Space Force",

    # Intelligence / defense components (public FOIA pages only)
    "www.dia.mil": "DIA",
    "www.nro.gov": "NRO",

    # Law enforcement / protective services (public FOIA pages)
    "www.secretservice.gov": "US Secret Service",
    "www.dea.gov": "DEA",

    # Historical / organizational labels — NO claim of dedicated public
    # repositories; applied only when a public FOIA URL is supplied.
    "label:SAC": "CIA Special Activities Center (label only)",
    "label:SAD": "CIA Special Activities Division (label only)",
    "label:NIA": "National Intelligence Authority (historical)",
}
|
air_force_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class USAirForceAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for U.S. Air Force public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "USAirForce"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
analysis.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List


def build_timeline(docs: List[dict]) -> Dict[str, int]:
    """Count documents per four-digit year.

    Args:
        docs: Document metadata dicts; each may carry a "date" value
            whose text starts with the year (e.g. "2001-05-12").

    Returns:
        Mapping of year string -> number of documents from that year.
        Entries with a missing, None, or non-year date are skipped.
    """
    timeline: Dict[str, int] = {}
    for doc in docs:
        # `or ""` guards against an explicit None date (the original
        # `d.get("date", "")[:4]` raised TypeError there); str() tolerates
        # non-string dates such as an integer year.
        year = str(doc.get("date") or "")[:4]
        if not year.isdigit():
            continue
        timeline[year] = timeline.get(year, 0) + 1
    return timeline
|
analytics.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import time
from collections import Counter

# Process-local event tally; lives for the lifetime of the interpreter.
_events = Counter()


def track(event: str):
    """Record one occurrence of *event*."""
    _events[event] += 1


def snapshot():
    """Return the current event tally alongside a UNIX timestamp."""
    return {
        "timestamp": int(time.time()),
        "events": dict(_events),
    }
|
app.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import asyncio
from ingest.cia_reading_room import CIAAdapter
from ingest.fbi_vault_live import FBIAdapter
from ingest.dod_reading_room_live import DoDAdapter
from core.async_search import fanout_search
from core.cache import dedupe
from core.cluster import cluster_results
from core.citations import citation_block
from core.redaction import redaction_confidence
from core.journalist import journalist_export
from core.explain import explain

# One adapter instance per agency, shared across all requests.
cia, fbi, dod = CIAAdapter(), FBIAdapter(), DoDAdapter()


async def run(q):
    """Fan the query out to every agency adapter and deduplicate merged hits."""
    hits = await fanout_search([cia, fbi, dod], q)
    return dedupe(hits)


with gr.Blocks() as demo:
    gr.Markdown("# FOIA Federated Search — Supreme")

    q = gr.Textbox(label="Query")
    results_state = gr.State([])

    with gr.Tabs():
        with gr.Tab("Clustered Results"):
            clusters = gr.JSON()
        with gr.Tab("Citations"):
            cites = gr.Markdown()
        with gr.Tab("Explainability"):
            explain_box = gr.JSON()

    preview = gr.JSON(label="Redaction Confidence")

    def _run(q):
        # Bridge the async fan-out into Gradio's synchronous callback.
        merged = asyncio.run(run(q))
        grouped = cluster_results(merged)
        citations_md = "\n".join(citation_block(hit) for hit in merged[:5])
        explanation = explain(merged)
        # Per-URL redaction confidence for the preview panel.
        confidence = {hit.get("url"): redaction_confidence(hit) for hit in merged}
        return merged, grouped, citations_md, explanation, confidence

    btn = gr.Button("Search")
    btn.click(_run, inputs=q, outputs=[results_state, clusters, cites, explain_box, preview])

    exp = gr.Button("Journalist Export")
    out = gr.File()
    exp.click(lambda r: journalist_export(r, "/tmp/journalist_export.zip"), inputs=results_state, outputs=out)

demo.launch()
|
appeal_pdf.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet


def generate_appeal_pdf(text, filename="appeal.pdf"):
    """Render *text* as a single-paragraph PDF and return the output path."""
    stylesheet = getSampleStyleSheet()
    body = Paragraph(text, stylesheet["BodyText"])
    SimpleDocTemplate(filename).build([body])
    return filename
|
appeals.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Compose a plain-text FOIA appeal letter for a publicly released document.

    Args:
        document: Identifier or title of the document being appealed.
        agency: Agency that issued the redaction or withholding.
        reason: Requester's basis for the appeal.

    Returns:
        The complete letter with surrounding whitespace trimmed.
    """
    letter = f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
"""
    # Trim the newlines introduced by the triple-quoted literal.
    return letter.strip()
|
army_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class USArmyAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for U.S. Army public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "USArmy"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
async_search.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio


async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge their document lists.

    Adapters that raise are silently skipped (gather captures their
    exceptions); only list-shaped results contribute to the output.
    """
    gathered = await asyncio.gather(
        *(adapter.search(query) for adapter in adapters),
        return_exceptions=True,
    )
    merged = []
    for outcome in gathered:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
|
audit.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
from datetime import datetime
from typing import Dict, List

# In-memory, append-only audit trail (process lifetime only).
_AUDIT_LOG: List[Dict] = []


def log_event(action: str, payload: Dict) -> Dict:
    """Append one audit entry and return it.

    Each entry gets a random UUID and a UTC ISO-8601 timestamp
    (suffixed with "Z" to mark it as UTC).
    """
    record = {
        "id": str(uuid.uuid4()),
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "action": action,
        "payload": payload,
    }
    _AUDIT_LOG.append(record)
    return record


def export_audit_log() -> List[Dict]:
    """Return a shallow copy of the trail so callers cannot mutate the log."""
    return list(_AUDIT_LOG)
|
cache.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
from typing import Dict, Any, List
from core.faiss_vector import FaissIndex

_TTL = 300  # seconds a cached entry stays valid
_cache: Dict[str, Any] = {}
_faiss = None  # lazily built semantic index shared by all cache writes


def _now():
    """Current UNIX time in whole seconds."""
    return int(time.time())


def _get_index():
    """Build the FAISS index on first use and reuse it afterwards."""
    global _faiss
    if _faiss is None:
        _faiss = FaissIndex()
    return _faiss


def cache_get(key):
    """Return cached data for *key*, or None when absent or expired."""
    entry = _cache.get(key)
    if not entry:
        return None
    stored_at, payload = entry
    if _now() - stored_at > _TTL:
        # Expired — evict so the cache dict does not grow without bound.
        _cache.pop(key, None)
        return None
    return payload


def cache_set(key, data: List[dict]):
    """Store *data* under *key* and feed its snippets to the semantic index."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    snippets = [d.get("snippet", "") for d in data if d.get("snippet")]
    if snippets:
        try:
            _get_index().add(snippets)
        except Exception:
            # Semantic indexing is best-effort; caching must never fail
            # because the vector backend is unavailable.
            pass


def dedupe(results: List[dict]) -> List[dict]:
    """Drop duplicate hits keyed on (source, url, snippet), preserving order."""
    seen = set()
    unique = []
    for hit in results:
        fingerprint = hash((hit.get("source"), hit.get("url"), hit.get("snippet")))
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append(hit)
    return unique


def source_counts(results: List[dict]) -> Dict[str, int]:
    """Tally results per agency source label (missing label -> "Unknown")."""
    counts = {}
    for hit in results:
        label = hit.get("source", "Unknown")
        counts[label] = counts.get(label, 0) + 1
    return counts
|
cia.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .common import fetch, clean
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def search_cia(query):
    """Search the CIA FOIA Electronic Reading Room for *query*.

    Returns a list of result dicts (title/agency/date/snippet/url).
    Fixes vs. the original: anchors without an ``href`` no longer raise
    KeyError, and absolute links are passed through unchanged instead of
    being prefixed again (same handling as ``search_fbi``).
    """
    url = "https://www.cia.gov/readingroom/search/site/"
    html = fetch(url, {"search_api_fulltext": query})
    soup = BeautifulSoup(html, "html.parser")

    results = []
    for item in soup.select(".views-row"):
        a = item.select_one("a")
        if not a:
            continue
        href = a.get("href", "")
        results.append({
            "title": clean(a.text),
            "agency": "CIA",
            "date": None,
            "snippet": None,
            # Site-relative links need the host prefix; absolute ones do not.
            "url": href if href.startswith("http") else "https://www.cia.gov" + href,
        })
    return results
|
cia_reading_room.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 3 |
+
|
| 4 |
+
class CIAAdapter(GenericFOIAAdapter):
    """Adapter for the CIA FOIA Reading Room public search page."""

    name = "CIA"
    rate_limit = 1  # requests per second
    robots_respected = True
    base_url = "https://www.cia.gov/readingroom/search/site/"

    async def search(self, query: str):
        """Query the public search endpoint; returns a single page-level hit
        (minimal safe parse), or [] on any non-200 response."""
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.get(self.base_url, params={"query": query})
        if response.status_code != 200:
            return []
        return [{
            "source": "CIA FOIA Reading Room",
            "query": query,
            "url": str(response.url),
            "snippet": "Public FOIA search result page",
        }]
|
citations.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def citation_block(result: dict) -> str:
    """Render a plain-text citation footer for a single search result."""
    lines = [
        "---",
        f"Source: {result.get('source')}",
        f"Title: {result.get('title','Unknown')}",
        f"URL: {result.get('url')}",
        f"Retrieved: {result.get('retrieved_at','N/A')}",
        "---",
    ]
    return "\n".join(lines)
|
cluster.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
from core.faiss_vector import FaissIndex
|
| 3 |
+
|
| 4 |
+
def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group results by their "source" field.

    The original also embedded every snippet into a fresh FaissIndex whose
    output was never read — pure dead work that additionally raised
    RuntimeError whenever the optional FAISS stack was absent. That step
    is removed; the returned grouping is unchanged.

    ``k`` is retained for interface compatibility but is currently unused
    (no semantic clustering is performed yet).
    """
    clusters: Dict[str, List[dict]] = {}
    for r in results:
        clusters.setdefault(r.get("source", "Unknown"), []).append(r)
    return clusters
|
coast_guard_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class USCoastGuardAdapter(GenericFOIAAdapter):
    """US Coast Guard public-release FOIA reading-room adapter.

    NOTE: restricted to publicly released materials only.
    """

    name = "USCoastGuard"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
collaboration.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import Dataset
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
_COLLAB: List[Dict] = []  # in-memory, process-local store of collaboration notes
|
| 5 |
+
|
| 6 |
+
def add_collaboration_note(document: str, note: str) -> Dict:
    """Attach a free-text note to *document* and return the stored record."""
    record: Dict = {"document": document, "note": note}
    _COLLAB.append(record)
    return record
|
| 13 |
+
|
| 14 |
+
def get_collaboration_dataset() -> Dataset:
    """Materialize all collaboration notes as a Hugging Face Dataset.

    An empty store yields a Dataset with the expected (empty) columns.
    """
    if _COLLAB:
        return Dataset.from_list(_COLLAB)
    return Dataset.from_dict({"document": [], "note": []})
|
common.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
# Identifying User-Agent sent with every outbound request (public, non-crawling use).
HEADERS = {
    "User-Agent": "FOIA-Federated-Search/1.0 (public, non-crawling)"
}
|
| 7 |
+
|
| 8 |
+
def fetch(url, params=None):
    """GET *url* (optionally with query *params*) and return the body text.

    Raises requests.HTTPError on non-2xx responses; 10-second timeout.
    """
    response = requests.get(url, params=params, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response.text
|
| 12 |
+
|
| 13 |
+
def clean(text):
    """Collapse runs of whitespace to single spaces; '' for falsy input."""
    if not text:
        return ""
    return " ".join(text.split())
|
darpa_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DARPAAdapter(GenericFOIAAdapter):
    """DARPA public-release FOIA reading-room adapter.

    NOTE: restricted to publicly released materials only.
    """

    name = "DARPA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dea.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dea(query):
    """Return a single pointer result to the DEA FOIA reading room.

    The snippet echoes the query; no live search is performed.
    """
    hit = {
        "title": "DEA FOIA Reading Room",
        "agency": "DEA",
        "date": None,
        "snippet": query,
        "url": "https://www.dea.gov/foia",
    }
    return [hit]
|
dea_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DEAAdapter(GenericFOIAAdapter):
    """DEA public-release FOIA reading-room adapter."""

    name = "DEA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dhs.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dhs(query):
    """Return a single pointer result to the DHS FOIA library.

    The snippet echoes the query; no live search is performed.
    (Fix: the title was an f-string with no placeholders — now a plain
    literal with identical value.)
    """
    return [{
        "title": "DHS FOIA Search",
        "agency": "DHS",
        "date": None,
        "snippet": query,
        "url": "https://www.dhs.gov/foia",
    }]
|
dhs_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DHSAdapter(GenericFOIAAdapter):
    """DHS public-release FOIA reading-room adapter."""

    name = "DHS"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dia.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dia(query):
    """Return a single pointer result to the DIA FOIA reading room.

    The snippet echoes the query; no live search is performed.
    """
    hit = {
        "title": "DIA FOIA Reading Room",
        "agency": "DIA",
        "date": None,
        "snippet": query,
        "url": "https://www.dia.mil/FOIA/",
    }
    return [hit]
|
dia_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DIAAdapter(GenericFOIAAdapter):
    """DIA public-release FOIA reading-room adapter."""

    name = "DIA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dod.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dod(query):
    """Return a single redirect-style result for the DoD FOIA reading room."""
    title = "DoD FOIA Search: " + query
    return [{
        "title": title,
        "agency": "DoD",
        "date": None,
        "snippet": "Redirect to DoD FOIA Reading Room search",
        "url": "https://open.defense.gov/Transparency/FOIA/",
    }]
|
dod_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DoDAdapter(GenericFOIAAdapter):
    """DoD public-release FOIA reading-room adapter."""

    name = "DoD"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dod_reading_room_live.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 3 |
+
|
| 4 |
+
class DoDAdapter(GenericFOIAAdapter):
    """Live adapter for the public DoD FOIA reading room."""

    name = "DoD FOIA Reading Room"
    rate_limit = 1  # requests per second
    robots_respected = True
    base_url = "https://www.esd.whs.mil/FOIA/Reading-Room/"

    async def search(self, query: str):
        """Hit the reading-room page with *query*; returns one page-level
        result, or [] on any non-200 response."""
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.get(self.base_url, params={"search": query})
        if response.status_code != 200:
            return []
        return [{
            "source": "DoD FOIA Reading Room",
            "query": query,
            "url": str(response.url),
            "snippet": "Public DoD FOIA reading room page",
        }]
|
doj.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_doj(query):
    """Return a single pointer result to the DOJ FOIA reading room.

    The snippet echoes the query; no live search is performed.
    (Fix: the title was an f-string with no placeholders — now a plain
    literal with identical value.)
    """
    return [{
        "title": "DOJ FOIA Reading Room",
        "agency": "DOJ",
        "date": None,
        "snippet": query,
        "url": "https://www.justice.gov/oip/foia-reading-room",
    }]
|
entity_graph.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import networkx as nx
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
|
| 4 |
+
def build_entity_graph(docs: List[Dict]) -> Dict:
    """Build an agency-to-entity co-occurrence graph as node/link dicts.

    Tokens in a document's "content" that are all-uppercase and longer than
    two characters are treated as candidate entities and linked to the
    document's agency. Returns {"nodes": [...], "links": [...]} suitable for
    force-directed visualisation.
    """
    graph = nx.Graph()

    for doc in docs:
        agency = doc.get("agency", "Unknown")
        graph.add_node(agency, group="agency")

        for token in doc.get("content", "").split():
            # Crude acronym heuristic: ALL-CAPS tokens longer than 2 chars.
            if len(token) > 2 and token.isupper():
                graph.add_node(token, group="entity")
                graph.add_edge(agency, token)

    return {
        "nodes": [{"id": node, "group": graph.nodes[node]["group"]} for node in graph.nodes],
        "links": [{"source": a, "target": b} for a, b in graph.edges],
    }
|
explain.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def explain(results):
    """Summarize how a result set was produced (transparency metadata)."""
    distinct_sources = {r.get("source") for r in results}
    return {
        "total_results": len(results),
        "sources": list(distinct_sources),
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)",
        ],
        "no_restricted_access": True,
    }
|
export_utils.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def export_json(data, path="/tmp/results.json"):
    """Write *data* as pretty-printed JSON and return the file path.

    *path* was previously hard-coded; it is now a backward-compatible
    parameter whose default preserves the old behavior.
    """
    with open(path, "w") as f:
        json.dump(data, f, indent=2)
    return path
|
faiss_vector.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional semantic-search stack. Both names are defined on failure so that
# FaissIndex.__init__ raises its intended RuntimeError instead of a NameError
# when the packages are absent (previously SentenceTransformer was left
# undefined if either import failed).
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    faiss = None
    SentenceTransformer = None
|
| 6 |
+
|
| 7 |
+
class FaissIndex:
    """Thin wrapper around a flat-L2 FAISS index over sentence embeddings.

    Raises RuntimeError at construction if the optional FAISS stack is
    not installed.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add() (needs embedding dim)
        self.docs = []     # stored texts, parallel to index rows

    def add(self, texts):
        """Embed *texts* and append them to the index; no-op for empty input."""
        if not texts:
            return  # fix: encoding an empty batch cannot size/feed the index
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query*.

        Returns [] before anything has been added (fix: previously crashed
        on ``None.search``). FAISS pads with index -1 when k exceeds the
        number of stored vectors, so negative ids are filtered out (fix:
        ``i < len(docs)`` alone wrongly admitted -1, i.e. docs[-1]).
        """
        if self.index is None:
            return []
        emb = self.model.encode([query])
        D, I = self.index.search(emb, k)
        return [self.docs[i] for i in I[0] if 0 <= i < len(self.docs)]
|
fbi.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .common import fetch, clean
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def search_fbi(query):
    """Search the FBI Vault and return result dicts for /vault/ links."""
    html = fetch("https://vault.fbi.gov/search", {"SearchableText": query})
    soup = BeautifulSoup(html, "html.parser")

    hits = []
    for anchor in soup.select("a"):
        href = anchor.get("href", "")
        if "/vault/" not in href:
            continue
        # Relative links need the host prefix; absolute ones pass through.
        full_url = href if href.startswith("http") else "https://vault.fbi.gov" + href
        hits.append({
            "title": clean(anchor.text),
            "agency": "FBI",
            "date": None,
            "snippet": None,
            "url": full_url,
        })
    return hits
|
fbi_vault.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def ingest_fbi_vault(url: str) -> dict:
    """Fetch a public FBI Vault page and return a normalized document record.

    Fix: a botched merge left the returned dict being *called* with
    ``(r.text, "html.parser")`` — a guaranteed TypeError at runtime —
    followed by an unreachable duplicate of the whole body. This is the
    single clean path; raises requests.HTTPError on non-2xx responses.
    """
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    text = soup.get_text(separator=" ", strip=True)
    title = soup.find("h1")

    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        "title": title.text if title else "FBI Vault Document",
        "text": text[:10000],  # cap stored text to 10k chars
    }
|