diff --git a/AGENCY_COVERAGE.md b/AGENCY_COVERAGE.md
new file mode 100644
index 0000000000000000000000000000000000000000..8102dbce4ce2d8b8faafc735de821b02155d8cbc
--- /dev/null
+++ b/AGENCY_COVERAGE.md
@@ -0,0 +1,11 @@
+
+# Agency Coverage Map
+
+| Agency | Public FOIA Reading Room |
+|------|---------------------------|
+| CIA | https://www.cia.gov/readingroom/ |
+| FBI | https://vault.fbi.gov/ |
+| DoD | https://www.foia.mil/ |
+| NSA | https://www.nsa.gov/readingroom/ |
+| NRO | https://www.nro.gov/FOIA/ |
+| DHS | https://www.dhs.gov/foia-reading-room |
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..84602d838ef4c4a8234f6562b5f27f5cb365093f
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,28 @@
+# Code of Conduct
+
+## Our Pledge
+
+This project is committed to providing a respectful, inclusive, and responsible environment for all contributors and users.
+
+## Acceptable Use
+
+Participants agree to:
+- Use this project for lawful, ethical, and non-harmful purposes
+- Respect the public-record nature of FOIA documents
+- Avoid speculative, defamatory, or misleading interpretations
+
+## Unacceptable Use
+
+This project must not be used to:
+- Harass or target individuals
+- Make unsubstantiated allegations
+- Claim access to classified or restricted information
+- Bypass legal or ethical safeguards
+
+## Enforcement
+
+Maintainers may remove content or restrict access that violates this Code of Conduct.
+
+---
+
+This project is intended for civic transparency, education, and research.
\ No newline at end of file
diff --git a/Dockerfile.hf b/Dockerfile.hf
new file mode 100644
index 0000000000000000000000000000000000000000..102f2f54c9443e8719db794e6fc2e59679b8afd7
--- /dev/null
+++ b/Dockerfile.hf
@@ -0,0 +1,5 @@
+FROM python:3.10-slim
+WORKDIR /app
+COPY . /app
+RUN pip install --no-cache-dir -r requirements.txt
+CMD ["python", "app.py"]
\ No newline at end of file
diff --git a/ETHICS.md b/ETHICS.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e34d8d7305a7da222883e162f44b8b30313d47c
--- /dev/null
+++ b/ETHICS.md
@@ -0,0 +1,24 @@
+# Ethics Policy
+
+## Purpose
+
+This project exists to support transparency, research, and public understanding of government records released under the Freedom of Information Act (FOIA).
+
+## Guiding Principles
+
+- **Public Sources Only:** All data must originate from publicly released documents.
+- **No Speculation:** The project does not infer, predict, or hypothesize beyond document text.
+- **Citation First:** Outputs must be traceable to source material.
+- **No Harm:** The tool must not be used to defame, harass, or mislead.
+
+## Redactions
+
+Redacted content is respected. This project does not attempt to reconstruct or infer withheld information.
+
+## Accountability
+
+Users are responsible for how they interpret and use results. This tool provides analytical assistance, not conclusions.
+
+---
+
+Ethical transparency is foundational to this project.
\ No newline at end of file
diff --git a/FILE_INVENTORY.txt b/FILE_INVENTORY.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93cf037b14317ce7b58123ad94ad810fe9f824a4
--- /dev/null
+++ b/FILE_INVENTORY.txt
@@ -0,0 +1,60 @@
+CODE_OF_CONDUCT.md
+ETHICS.md
+README.md
+README_PROD.md
+__init__.py
+adapters/__init__.py
+adapters/cia.py
+adapters/common.py
+adapters/dea.py
+adapters/dhs.py
+adapters/dia.py
+adapters/dod.py
+adapters/doj.py
+adapters/fbi.py
+adapters/ice.py
+adapters/nia.py
+adapters/nsa.py
+app.py
+appeal_pdf.py
+appeals/__init__.py
+appeals/pdf_appeal.py
+audit.py
+collaboration.py
+collaboration/__init__.py
+collaboration/icij.py
+core/__init__.py
+core/analysis.py
+core/appeals.py
+core/explain.py
+core/index.py
+core/multi_program.py
+core/redaction.py
+core/search.py
+core/vector.py
+data/demo/documents/aatip_sample.txt
+data/demo/documents/tencap_sample.txt
+data/demo/metadata.json
+data/foia_sources.json
+entity_graph.py
+export_utils.py
+file_structure.txt
+foia_pdf.py
+foia_requests.py
+gitattributes.txt
+ingest/__init__.py
+ingest/agency_registry.py
+ingest/cia_reading_room.py
+ingest/fbi_vault.py
+ingest/generic_public_foia.py
+ingest/loader.py
+ingest/sources.py
+requirements.txt
+schemas.py
+search/__init__.py
+search/semantic.py
+semantic.py
+tests/__init__.py
+tests/test_core.py
+tests/test_schema.py
+vector_store.py
\ No newline at end of file
diff --git a/HASH_MANIFEST.json b/HASH_MANIFEST.json
new file mode 100644
index 0000000000000000000000000000000000000000..259e73d2add615918199e1c9ad8ccdf1b5b3aec5
--- /dev/null
+++ b/HASH_MANIFEST.json
@@ -0,0 +1,62 @@
+{
+ "CODE_OF_CONDUCT.md": "b674f96cae26f0050be863c4b8782510fcae5ab855f0822ec4a0217763a84601",
+ "ETHICS.md": "d4f7c23c1e60297712786e392800158fcbe21116576496632e8221b0b8a16ff2",
+ "README.md": "e9bfdd2d6a4422fcb132bd4033a69d2241574c31fab71820e4643491b3b1225b",
+ "app.py": "c2a2b16ce45a327de0d42196104cb7fc50ec29ff1cb1fb95517a8ca655a3192a",
+ "appeal_pdf.py": "2d28ca1d0e796bfb5da25eac05a91354aadd58deefd041acded9a01a64055f9c",
+ "audit.py": "01c286d4067c6fffcb990391d8f750719c1ccac07eafc4477ccbdd1be4dd11e8",
+ "collaboration.py": "7cbd52c0da9be9f205b2901d8a94f28cb96612ffe506bcad1c7991885cd2d947",
+ "entity_graph.py": "dbe21fa0d8e7528daeee34d598efba836ab6370ad609de80746be1b12a4e0ff5",
+ "export_utils.py": "a01a088fd650a947a7831e795508208d3caa430d099aa5a8d7823ba462f0a80e",
+ "file_structure.txt": "6eee55e586751e3ae1405349f01dd35703e678d8e105ea19fc58eb15e4c2a6fa",
+ "foia_pdf.py": "babbd69a2da67681f15596ab254174310b8381d5853da72fe068d31d746725ab",
+ "foia_requests.py": "ca9c765bb7a591c462a94b0aa42957d1b3124128266d4880f0654895ce0ca6c0",
+ "gitattributes.txt": "11ad7efa24975ee4b0c3c3a38ed18737f0658a5f75a0a96787b576a78a023361",
+ "requirements.txt": "444bc9beedfa3fde82790f47c1e9b94bab90be2fefd0648de0ffdebbcc2eb61c",
+ "schemas.py": "e08b38513be2572af7d022e013f037c4f614f2117db85d4d776c408be96815ef",
+ "semantic.py": "4ffcf9149f08b8e69473e5418588dd370bbd470b137f2d0761901fccf09238cf",
+ "vector_store.py": "c61701e38e12150c541d284e13824341dde1794d3b4149d2a7d332b8023ad923",
+ "__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "README_PROD.md": "3b5d0a9f882f8f980a08452ca589a788b3c7cfe2ed8b7ca13a01f9c4a12e9060",
+ "adapters/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "adapters/cia.py": "a934e6a67aa6662391814036a9084779f15ad9ca5059f5461e2c374dfa9c3344",
+ "adapters/common.py": "c76c7ea1ce1616a2c99bfeec47ca046e75088f4d807f9c94ce5f87c9eeed5714",
+ "adapters/dea.py": "f1b9832aeaabecf5da8f1125e33883ce28e37d081149f6c75bf9ef49ac3ead8a",
+ "adapters/dhs.py": "66c44ee323135ee8e3c0cb7a2bb83d2d9dc20f7b88b6db4823e1bd5d03be6227",
+ "adapters/dia.py": "5c003321750582f502bcf0e2115956edb9af3aa8937917b72d7b25036b493f6d",
+ "adapters/dod.py": "410726bcab164fa9991d0ba61b3d9586d271ee4d55f65d1bd02193e84f02ed30",
+ "adapters/doj.py": "56080addcaef0a01d2395b6d44a93e9e271bc569a688f65657617d730a054eac",
+ "adapters/fbi.py": "b81b80972adf70b8283f2c16b241d17f46ab3ab73cd3ab4155dc88f7afbbcfc2",
+ "adapters/ice.py": "f0d06239d483933ba53966bc8015b9ca9f3ead3ebb535f4f963f5a26afd340b3",
+ "adapters/nia.py": "cbc240d23d7ac144d0ca0a49e83341df579903092c13c7603cfe438e7dd58a84",
+ "adapters/nsa.py": "a5a7ff4f8d3b1397bccc6095471de814aad75e2711566065f8cf7f4f43c59303",
+ "appeals/pdf_appeal.py": "cfe7ca493bf9a4280eff3d90494b2e2afc8bfed92ee99d5e175c1daf49ddadf6",
+ "appeals/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "collaboration/icij.py": "bd02217afd54664762594dfcd1e8088ac3666c641acd450d3b233cf05f08a641",
+ "collaboration/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "core/analysis.py": "e745cc6ad43d5193c92b5d7c417db4546ec301a96761090a319f7a477722dd99",
+ "core/appeals.py": "9ac66f34fdb2e741b6341de258291fd99db7f4a95862e39aa4cae94448726609",
+ "core/explain.py": "accdde04f5faf85b48302917f6274a12f06b9058fac5941cfa7ce9a64a6c45a3",
+ "core/index.py": "d266fc0aacbc2445b25cafbc29530e9138bb626090fb716681f300976927903c",
+ "core/multi_program.py": "444928c79f9778ebffcdb47262ba63b2eb19d2ed4d97d5632682a92e91861138",
+ "core/redaction.py": "b99bbbcb659e1f60902bca7e2bde5b0c28f371b7a6feb9daff489bb8fd96b878",
+ "core/search.py": "5843e5ee44d88688862b73e5457ff596dd229fc9433600c2e1a978868c8a2296",
+ "core/vector.py": "518e78f8c363735f5629584d2d5e25876a7f80063cd74e72a080723380141ce8",
+ "core/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "data/foia_sources.json": "8fe166a285717548afb937ad7a669020c60f91d9ec9f06dbdee9954f3396bd2a",
+ "data/demo/metadata.json": "89d069dd00b20d1c74eb6f192a09b9d11226d86d5f7754159b3b1717512302d3",
+ "data/demo/documents/aatip_sample.txt": "8b8d9a6167699a123885330093dac739025bfe0d7fabfdfd596707ab53db9f81",
+ "data/demo/documents/tencap_sample.txt": "e1930579e04e76cc2ced2b5b253fa59e907b28bbffba2f8c1710693cfc84b167",
+ "ingest/agency_registry.py": "89581ae5dcf6f0e5614939ce8538e17f4e22a1751d806bfce5cd51fbf9d35f85",
+ "ingest/cia_reading_room.py": "ebfa118842937a7929a1ce58998650f11081306e8a017d53f01e11262917f2e5",
+ "ingest/fbi_vault.py": "9a24fd572db556cc182239738ca2c551d6cb6a393a325f3fc8f6db9cbf1c157b",
+ "ingest/generic_public_foia.py": "60f174b9ada68330a70ca11898ae3fbb7d225e2f404265903a5079aaa274baa1",
+ "ingest/loader.py": "12b2b68d4c3a902270be73bebb1218314f19b225f7df4e436191f433378aca18",
+ "ingest/sources.py": "4b995bff081e14cbe3b66deb516abc74fce09e29f3e36463f60bbbcaf11b075b",
+ "ingest/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "search/semantic.py": "974faa592af9a67ec50a691180ad68d90e00d38244871680c0c45f31a77f8f36",
+ "search/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "tests/test_core.py": "58e5d87c0de8482328abcc27d7a1452cdc6a69740eb0de4395c78a250d12d79e",
+ "tests/test_schema.py": "04c0343db5c7516679395717a1dd4c2eca4e325cf038e5c6ee794c2a62649119",
+ "tests/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+}
\ No newline at end of file
diff --git a/HF_JUSTIFICATION.md b/HF_JUSTIFICATION.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd7673ec626238d0ab5fa9140845a4d2050c782f
--- /dev/null
+++ b/HF_JUSTIFICATION.md
@@ -0,0 +1,12 @@
+
+This Hugging Face Space provides a public-interest federated search interface
+across U.S. Government FOIA Electronic Reading Rooms.
+
+Safeguards:
+- Public sources only
+- No authentication bypass
+- Rate limiting and health checks
+- Redaction-aware previews
+- Metadata indexing only
+
+Intended for journalism, research, and accountability.
diff --git a/HF_SPACE_README.md b/HF_SPACE_README.md
new file mode 100644
index 0000000000000000000000000000000000000000..38e2b8d940fbc4b2be68bacc0023464bc21a07f2
--- /dev/null
+++ b/HF_SPACE_README.md
@@ -0,0 +1,32 @@
+# FOIA Federated Document Search (Public Interest)
+
+🔍 **Hugging Face Space — Transparency & Accountability Tool**
+
+This application provides **semantic search across publicly released U.S. Government FOIA electronic reading rooms**.
+It does **not** access classified, private, or restricted systems.
+
+## What This Is
+- Federated FOIA document search
+- Semantic + keyword hybrid retrieval
+- Redaction-aware exports
+- Audit logging
+
+## What This Is NOT
+- Surveillance
+- Intelligence gathering
+- Law enforcement tooling
+- Political persuasion
+
+## Data Sources
+- CIA FOIA Electronic Reading Room
+- FBI Vault
+- Other agency FOIA libraries (public releases only)
+
+## Compliance
+- FOIA-only sources
+- robots.txt respected
+- Rate-limited adapters
+- Redaction before export
+
+## Intended Users
+Researchers, journalists, historians, and the general public.
\ No newline at end of file
diff --git a/LEGAL_MEMO.md b/LEGAL_MEMO.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb284856a29473b980886310b4edb1d7e47ea77f
--- /dev/null
+++ b/LEGAL_MEMO.md
@@ -0,0 +1,6 @@
+
+FOIA Federated Search — Legal Summary
+
+This system indexes publicly released FOIA records.
+No restricted access, no scraping of protected systems.
+Fully compliant with 5 U.S.C. § 552.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cbf7b50e75d16727b30617078e314d2e3ea09274
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+---
+title: FOIA Federated Search
+emoji: 🔍
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: "4.0"
+app_file: app.py
+pinned: false
+license: mit
+---
+
+# FOIA Federated Search (Public Interest)
+
+A Hugging Face Space that provides **live federated search** across publicly available
+U.S. Government FOIA Electronic Reading Rooms (CIA, FBI, DoD, and more).
+
+## Key Features
+- Live async fan-out search (no scraping beyond public endpoints)
+- Per-agency source toggles + result counts
+- Semantic *search-in-results* using FAISS + sentence-transformers
+- Local caching + deduplication
+- PDF export of search results
+- Inline document preview (where permitted by source)
+- Rate-limited, health-checked agency adapters
+
+## Trust & Safety
+- Queries only **public FOIA reading rooms**
+- Honors robots.txt, rate limits, and agency terms
+- No authentication bypass or restricted content
+- Designed for research, journalism, and public accountability
+
+## Legal
+All content remains hosted by the originating agency.
+This tool indexes metadata and snippets for discovery only.
\ No newline at end of file
diff --git a/README_PROD.md b/README_PROD.md
new file mode 100644
index 0000000000000000000000000000000000000000..0e8649bc702f34076af5f8acda6799d044c4433b
--- /dev/null
+++ b/README_PROD.md
@@ -0,0 +1,40 @@
+# FOIA HF Document Search — Production Build
+
+## Entry Point
+- `app.py` — orchestrates ingestion, semantic search, export, and audit hooks.
+
+## Ingestion Adapters (Present)
+- CIA Reading Room
+- FBI Vault
+- Generic Public FOIA
+
+## Missing / Stubbed Adapters (Recommended)
+- DoD (incl. components)
+- NSA
+- DIA
+- DHS
+- DEA
+- ICE
+
+## Vector Backend Assumptions
+- Current code supports abstract vector ops.
+- Recommended backends:
+ - FAISS (local)
+ - Chroma (persistent)
+ - HuggingFace embeddings
+ - OpenAI embeddings (optional)
+
+## Live Federated Search Upgrade
+- Async querying via `asyncio` + `httpx`
+- Adapter interface with rate limits
+- Response caching + deduplication
+- Circuit breakers for abuse prevention
+
+## Compliance
+- Respect robots.txt where applicable
+- Rate limiting per agency
+- Redaction before export
+- Audit logging enabled
+
+## Build Timestamp
+2026-01-09T23:51:16.728748Z
\ No newline at end of file
diff --git a/SOURCES.md b/SOURCES.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd8a567b582c09b83fc4d9bc2da94fb77ac6f08f
--- /dev/null
+++ b/SOURCES.md
@@ -0,0 +1,23 @@
+# FOIA Public Sources
+
+All sources listed here are **public FOIA electronic reading rooms** or official public-release libraries.
+
+## Intelligence & Defense
+- CIA FOIA Electronic Reading Room — https://www.cia.gov/readingroom/
+- FBI Vault — https://vault.fbi.gov/
+- DARPA FOIA Library — https://www.darpa.mil/work-with-us/foia
+- NRO FOIA Reading Room — https://www.nro.gov/FOIA/
+- DoD FOIA Reading Room — https://www.esd.whs.mil/FOIA/Reading-Room/
+
+## Military Branches
+- U.S. Army FOIA — https://www.army.mil/foia
+- U.S. Navy FOIA — https://www.secnav.navy.mil/foia
+- U.S. Air Force FOIA — https://www.af.mil/FOIA/
+- U.S. Marine Corps FOIA — https://www.hqmc.marines.mil/Agencies/FOIA/
+- U.S. Space Force FOIA — https://www.spaceforce.mil/FOIA/
+- U.S. Coast Guard FOIA — https://www.uscg.mil/FOIA/
+
+## Other Agencies
+- DHS FOIA Library — https://www.dhs.gov/foia-library
+- DEA FOIA Reading Room — https://www.dea.gov/foia
+- Secret Service FOIA — https://www.secretservice.gov/foia
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/aatip_reading_room.py b/aatip_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..960d11cb61306bb4a24433c2ed8d075331d72252
--- /dev/null
+++ b/aatip_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class AATIPAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "AATIP"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/aatip_sample.txt b/aatip_sample.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8f19e81345483002168757559fd9adb4f73ea192
--- /dev/null
+++ b/aatip_sample.txt
@@ -0,0 +1 @@
+AATIP referenced ████ by DoD components between 2009 and 2017.
\ No newline at end of file
diff --git a/agency_registry.py b/agency_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..02df4fe64e309c6eb6b4ffc325fb914179bdb222
--- /dev/null
+++ b/agency_registry.py
@@ -0,0 +1,39 @@
+# ingest/agency_registry.py
+
+# Domains are PUBLIC FOIA / reading room hosts only.
+# Labels may include units if (and only if) a public FOIA page exists
+# and the user provides its URL.
+
+ALLOWED_FOIA_SOURCES = {
+ # Core
+ "vault.fbi.gov": "FBI",
+ "www.cia.gov": "CIA",
+ "www.archives.gov": "NARA",
+ "foia.state.gov": "State Dept",
+ "www.nsa.gov": "NSA",
+ "www.defense.gov": "DoD",
+ "www.esd.whs.mil": "DoD FOIA",
+ "www.whitehouse.gov": "White House",
+
+ # Military (public FOIA pages)
+ "www.af.mil": "USAF",
+ "www.navy.mil": "US Navy",
+ "www.marines.mil": "USMC",
+ "www.army.mil": "US Army",
+ "www.spaceforce.mil": "US Space Force",
+
+ # Intelligence / defense components (public FOIA pages only)
+ "www.dia.mil": "DIA",
+ "www.nro.gov": "NRO",
+
+ # Law enforcement / protective services (public FOIA pages)
+ "www.secretservice.gov": "US Secret Service",
+ "www.dea.gov": "DEA",
+
+ # Labels for historical / organizational references
+ # (NO claim of dedicated public repositories)
+ # These are ONLY labels applied if a public FOIA URL is supplied.
+ "label:SAC": "CIA Special Activities Center (label only)",
+ "label:SAD": "CIA Special Activities Division (label only)",
+ "label:NIA": "National Intelligence Authority (historical)"
+}
\ No newline at end of file
diff --git a/air_force_foia_reading_room.py b/air_force_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4b63eec6d25ba0ba214b28f85e42e501175277
--- /dev/null
+++ b/air_force_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class USAirForceAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "USAirForce"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/analysis.py b/analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..d72fd6b49e6c727b3d28f299e746ca536ab7130e
--- /dev/null
+++ b/analysis.py
@@ -0,0 +1,12 @@
+from typing import Dict, List
+
+def build_timeline(docs: List[dict]) -> Dict[str, int]:
+ timeline: Dict[str, int] = {}
+
+ for d in docs:
+ year = d.get("date", "")[:4]
+ if not year.isdigit():
+ continue
+ timeline[year] = timeline.get(year, 0) + 1
+
+ return timeline
\ No newline at end of file
diff --git a/analytics.py b/analytics.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c92e684dae5471b6a01c00c4e78a3750681f179
--- /dev/null
+++ b/analytics.py
@@ -0,0 +1,14 @@
+
+import time
+from collections import Counter
+
+_events = Counter()
+
+def track(event: str):
+ _events[event] += 1
+
+def snapshot():
+ return {
+ "timestamp": int(time.time()),
+ "events": dict(_events)
+ }
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64c102cff6a209b11503c426dd3a50a6b536648
--- /dev/null
+++ b/app.py
@@ -0,0 +1,51 @@
+import gradio as gr
+import asyncio
+from ingest.cia_reading_room import CIAAdapter
+from ingest.fbi_vault_live import FBIAdapter
+from ingest.dod_reading_room_live import DoDAdapter
+from core.async_search import fanout_search
+from core.cache import dedupe
+from core.cluster import cluster_results
+from core.citations import citation_block
+from core.redaction import redaction_confidence
+from core.journalist import journalist_export
+from core.explain import explain
+
+cia, fbi, dod = CIAAdapter(), FBIAdapter(), DoDAdapter()
+
+async def run(q):
+ res = await fanout_search([cia,fbi,dod], q)
+ return dedupe(res)
+
+with gr.Blocks() as demo:
+    gr.Markdown("# FOIA Federated Search — Supreme")
+
+ q = gr.Textbox(label="Query")
+ results_state = gr.State([])
+
+ with gr.Tabs():
+ with gr.Tab("Clustered Results"):
+ clusters = gr.JSON()
+ with gr.Tab("Citations"):
+ cites = gr.Markdown()
+ with gr.Tab("Explainability"):
+ explain_box = gr.JSON()
+
+ preview = gr.JSON(label="Redaction Confidence")
+
+ def _run(q):
+ res = asyncio.run(run(q))
+ cl = cluster_results(res)
+ cites_md = "\n".join(citation_block(r) for r in res[:5])
+ explain_data = explain(res)
+ red = {r.get("url"): redaction_confidence(r) for r in res}
+ return res, cl, cites_md, explain_data, red
+
+ btn = gr.Button("Search")
+ btn.click(_run, inputs=q, outputs=[results_state, clusters, cites, explain_box, preview])
+
+ exp = gr.Button("Journalist Export")
+ out = gr.File()
+ exp.click(lambda r: journalist_export(r, "/tmp/journalist_export.zip"), inputs=results_state, outputs=out)
+
+demo.launch()
\ No newline at end of file
diff --git a/appeal_pdf.py b/appeal_pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab150c92399faf4a19fa6e8ecf069466c5e6f835
--- /dev/null
+++ b/appeal_pdf.py
@@ -0,0 +1,8 @@
+from reportlab.platypus import SimpleDocTemplate, Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
+
+def generate_appeal_pdf(text, filename="appeal.pdf"):
+ doc = SimpleDocTemplate(filename)
+ styles = getSampleStyleSheet()
+ doc.build([Paragraph(text, styles["BodyText"])])
+ return filename
\ No newline at end of file
diff --git a/appeals.py b/appeals.py
new file mode 100644
index 0000000000000000000000000000000000000000..e122f34827e9a002dba6e74eda8b01b19b7d086d
--- /dev/null
+++ b/appeals.py
@@ -0,0 +1,16 @@
+def draft_appeal(document: str, agency: str, reason: str) -> str:
+ return f"""
+FOIA Appeal — Request for Reconsideration
+
+Agency: {agency}
+Document: {document}
+
+Basis for Appeal:
+{reason}
+
+This appeal concerns a publicly released document and requests
+review of redactions or withholdings under applicable FOIA exemptions.
+
+Sincerely,
+[Requestor]
+""".strip()
\ No newline at end of file
diff --git a/army_foia_reading_room.py b/army_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d94d9d7faa789746e1c5962adfa528984aa42b0
--- /dev/null
+++ b/army_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class USArmyAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "USArmy"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/async_search.py b/async_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e714214fc7abaad0e68b95015101b6563b4c1ee
--- /dev/null
+++ b/async_search.py
@@ -0,0 +1,10 @@
+import asyncio
+
+async def fanout_search(adapters, query):
+ tasks = [adapter.search(query) for adapter in adapters]
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+ docs = []
+ for r in results:
+ if isinstance(r, list):
+ docs.extend(r)
+ return docs
\ No newline at end of file
diff --git a/audit.py b/audit.py
new file mode 100644
index 0000000000000000000000000000000000000000..425d2e84d36449448593d6c6e28c1915296c2afb
--- /dev/null
+++ b/audit.py
@@ -0,0 +1,18 @@
+import uuid
+from datetime import datetime
+from typing import Dict, List
+
+_AUDIT_LOG: List[Dict] = []
+
+def log_event(action: str, payload: Dict) -> Dict:
+ entry = {
+ "id": str(uuid.uuid4()),
+ "timestamp": datetime.utcnow().isoformat() + "Z",
+ "action": action,
+ "payload": payload
+ }
+ _AUDIT_LOG.append(entry)
+ return entry
+
+def export_audit_log() -> List[Dict]:
+ return list(_AUDIT_LOG)
\ No newline at end of file
diff --git a/cache.py b/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bbf75519591603bff0b216c3f3e75f6a6cbb8ba
--- /dev/null
+++ b/cache.py
@@ -0,0 +1,53 @@
+import time
+from typing import Dict, Any, List
+from core.faiss_vector import FaissIndex
+
+_TTL = 300 # seconds
+_cache: Dict[str, Any] = {}
+_faiss = None
+
+def _now():
+ return int(time.time())
+
+def _get_index():
+ global _faiss
+ if _faiss is None:
+ _faiss = FaissIndex()
+ return _faiss
+
+def cache_get(key):
+ v = _cache.get(key)
+ if not v:
+ return None
+ ts, data = v
+ if _now() - ts > _TTL:
+ _cache.pop(key, None)
+ return None
+ return data
+
+def cache_set(key, data: List[dict]):
+ _cache[key] = (_now(), data)
+ # add snippets to FAISS for local semantic recall
+ texts = [d.get("snippet","") for d in data if d.get("snippet")]
+ if texts:
+ try:
+ _get_index().add(texts)
+ except Exception:
+ pass
+
+def dedupe(results: List[dict]) -> List[dict]:
+ seen = set()
+ out = []
+ for r in results:
+ h = hash((r.get("source"), r.get("url"), r.get("snippet")))
+ if h not in seen:
+ seen.add(h)
+ out.append(r)
+ return out
+
+def source_counts(results: List[dict]) -> Dict[str,int]:
+ counts = {}
+ for r in results:
+ s = r.get("source","Unknown")
+ counts[s] = counts.get(s, 0) + 1
+ return counts
\ No newline at end of file
diff --git a/cia.py b/cia.py
new file mode 100644
index 0000000000000000000000000000000000000000..99fea2a8457049c7d3dd829589a4a54a8f66edc2
--- /dev/null
+++ b/cia.py
@@ -0,0 +1,21 @@
+from .common import fetch, clean
+from bs4 import BeautifulSoup
+
+def search_cia(query):
+ url = "https://www.cia.gov/readingroom/search/site/"
+ html = fetch(url, {"search_api_fulltext": query})
+ soup = BeautifulSoup(html, "html.parser")
+
+ results = []
+ for item in soup.select(".views-row"):
+ a = item.select_one("a")
+ if not a:
+ continue
+ results.append({
+ "title": clean(a.text),
+ "agency": "CIA",
+ "date": None,
+ "snippet": None,
+ "url": "https://www.cia.gov" + a["href"]
+ })
+ return results
\ No newline at end of file
diff --git a/cia_reading_room.py b/cia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..44553efa5ba45ecf10229ff01d0b0d78c7b2216c
--- /dev/null
+++ b/cia_reading_room.py
@@ -0,0 +1,21 @@
+import httpx
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class CIAAdapter(GenericFOIAAdapter):
+ name = "CIA"
+ rate_limit = 1
+ robots_respected = True
+ base_url = "https://www.cia.gov/readingroom/search/site/"
+
+ async def search(self, query: str):
+ async with httpx.AsyncClient(timeout=10) as client:
+ r = await client.get(self.base_url, params={"query": query})
+ if r.status_code != 200:
+ return []
+ # Minimal safe parse: return page-level hit
+ return [{
+ "source": "CIA FOIA Reading Room",
+ "query": query,
+ "url": str(r.url),
+ "snippet": "Public FOIA search result page"
+ }]
\ No newline at end of file
diff --git a/citations.py b/citations.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e853c4374bf990251a6426fe760de069923c5c
--- /dev/null
+++ b/citations.py
@@ -0,0 +1,7 @@
+def citation_block(result: dict) -> str:
+ return f"""---
+Source: {result.get('source')}
+Title: {result.get('title','Unknown')}
+URL: {result.get('url')}
+Retrieved: {result.get('retrieved_at','N/A')}
+---"""
\ No newline at end of file
diff --git a/cluster.py b/cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..f619611bc3b6bdb43d52b04231aac356726b840a
--- /dev/null
+++ b/cluster.py
@@ -0,0 +1,12 @@
+from typing import List, Dict
+from core.faiss_vector import FaissIndex
+
+def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
+ texts = [r.get("snippet","") for r in results if r.get("snippet")]
+ index = FaissIndex()
+ index.add(texts)
+ clusters = {}
+ for r in results:
+ key = r.get("source","Unknown")
+ clusters.setdefault(key, []).append(r)
+ return clusters
\ No newline at end of file
diff --git a/coast_guard_foia_reading_room.py b/coast_guard_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..148688b738ba7e3234060229fec52cc54bfd8e1a
--- /dev/null
+++ b/coast_guard_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class USCoastGuardAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "USCoastGuard"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/collaboration.py b/collaboration.py
new file mode 100644
index 0000000000000000000000000000000000000000..5835a935920fb34018fff55c84424c266a9df285
--- /dev/null
+++ b/collaboration.py
@@ -0,0 +1,17 @@
+from datasets import Dataset
+from typing import Dict, List
+
+_COLLAB: List[Dict] = []
+
+def add_collaboration_note(document: str, note: str) -> Dict:
+ record = {
+ "document": document,
+ "note": note
+ }
+ _COLLAB.append(record)
+ return record
+
+def get_collaboration_dataset() -> Dataset:
+ if not _COLLAB:
+ return Dataset.from_dict({"document": [], "note": []})
+ return Dataset.from_list(_COLLAB)
\ No newline at end of file
diff --git a/common.py b/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d7ced8def724b21bbe77479a611ef2a30b3bc67
--- /dev/null
+++ b/common.py
@@ -0,0 +1,14 @@
+import requests
+from bs4 import BeautifulSoup
+
+HEADERS = {
+ "User-Agent": "FOIA-Federated-Search/1.0 (public, non-crawling)"
+}
+
+def fetch(url, params=None):
+ r = requests.get(url, params=params, headers=HEADERS, timeout=10)
+ r.raise_for_status()
+ return r.text
+
+def clean(text):
+ return " ".join(text.split()) if text else ""
\ No newline at end of file
diff --git a/darpa_reading_room.py b/darpa_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d366400b78d90522d1366e21e83566acd0fc510
--- /dev/null
+++ b/darpa_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DARPAAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "DARPA"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/dea.py b/dea.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6273c42a17c11a6f34ce2cb8b6625cb0bc3abdb
--- /dev/null
+++ b/dea.py
@@ -0,0 +1,8 @@
+def search_dea(query):
+ return [{
+ "title": "DEA FOIA Reading Room",
+ "agency": "DEA",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.dea.gov/foia"
+ }]
\ No newline at end of file
diff --git a/dea_reading_room.py b/dea_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..e20c7f283e986c121edba11b361d1496048df287
--- /dev/null
+++ b/dea_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DEAAdapter(GenericFOIAAdapter):
+ name = "DEA"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/dhs.py b/dhs.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7822b60398ca8a84a422e2d10355037cc939e61
--- /dev/null
+++ b/dhs.py
@@ -0,0 +1,8 @@
+def search_dhs(query):
+ return [{
+ "title": f"DHS FOIA Search",
+ "agency": "DHS",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.dhs.gov/foia"
+ }]
\ No newline at end of file
diff --git a/dhs_reading_room.py b/dhs_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ef4b4f6c1fc8d4fd702564a2627665072a98b5
--- /dev/null
+++ b/dhs_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DHSAdapter(GenericFOIAAdapter):
+ name = "DHS"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/dia.py b/dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..e331598fa4e6d432882ef976d5c2651597eecd6e
--- /dev/null
+++ b/dia.py
@@ -0,0 +1,8 @@
+def search_dia(query):
+ return [{
+ "title": "DIA FOIA Reading Room",
+ "agency": "DIA",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.dia.mil/FOIA/"
+ }]
\ No newline at end of file
diff --git a/dia_reading_room.py b/dia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2d8f59c9941b64d4d03b4f38503ae6fce6866af
--- /dev/null
+++ b/dia_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DIAAdapter(GenericFOIAAdapter):
+ name = "DIA"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/dod.py b/dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..9998f4774f94310457ab200d4c1da567fec2c063
--- /dev/null
+++ b/dod.py
@@ -0,0 +1,8 @@
+def search_dod(query):
+ return [{
+ "title": f"DoD FOIA Search: {query}",
+ "agency": "DoD",
+ "date": None,
+ "snippet": "Redirect to DoD FOIA Reading Room search",
+ "url": "https://open.defense.gov/Transparency/FOIA/"
+ }]
\ No newline at end of file
diff --git a/dod_reading_room.py b/dod_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..914c56085fc52b1e96738c3a4c54f6855324c109
--- /dev/null
+++ b/dod_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DoDAdapter(GenericFOIAAdapter):
+ name = "DoD"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/dod_reading_room_live.py b/dod_reading_room_live.py
new file mode 100644
index 0000000000000000000000000000000000000000..528378cb9e20282e3af8ee4d0ed79eb09b9e327a
--- /dev/null
+++ b/dod_reading_room_live.py
@@ -0,0 +1,20 @@
+import httpx
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class DoDAdapter(GenericFOIAAdapter):
+ name = "DoD FOIA Reading Room"
+ rate_limit = 1
+ robots_respected = True
+ base_url = "https://www.esd.whs.mil/FOIA/Reading-Room/"
+
+ async def search(self, query: str):
+ async with httpx.AsyncClient(timeout=10) as client:
+ r = await client.get(self.base_url, params={"search": query})
+ if r.status_code != 200:
+ return []
+ return [{
+ "source": "DoD FOIA Reading Room",
+ "query": query,
+ "url": str(r.url),
+ "snippet": "Public DoD FOIA reading room page"
+ }]
\ No newline at end of file
diff --git a/doj.py b/doj.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e957886230ce856859520d3305d41c3a414c777
--- /dev/null
+++ b/doj.py
@@ -0,0 +1,8 @@
+def search_doj(query):
+ return [{
+ "title": f"DOJ FOIA Reading Room",
+ "agency": "DOJ",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.justice.gov/oip/foia-reading-room"
+ }]
\ No newline at end of file
diff --git a/entity_graph.py b/entity_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..da4d041c6b937de7d0ac5e5968e2d0b005bed424
--- /dev/null
+++ b/entity_graph.py
@@ -0,0 +1,19 @@
+import networkx as nx
+from typing import List, Dict
+
+def build_entity_graph(docs: List[Dict]) -> Dict:
+ G = nx.Graph()
+
+ for d in docs:
+ agency = d.get("agency", "Unknown")
+ G.add_node(agency, group="agency")
+
+ for token in d.get("content", "").split():
+ if token.isupper() and len(token) > 2:
+ G.add_node(token, group="entity")
+ G.add_edge(agency, token)
+
+ return {
+ "nodes": [{"id": n, "group": G.nodes[n]["group"]} for n in G.nodes],
+ "links": [{"source": u, "target": v} for u, v in G.edges]
+ }
\ No newline at end of file
diff --git a/explain.py b/explain.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6518fbd39624772ba5c28f0227cd42ff0e1008f
--- /dev/null
+++ b/explain.py
@@ -0,0 +1,12 @@
+def explain(results):
+ return {
+ "total_results": len(results),
+ "sources": list(set(r.get("source") for r in results)),
+ "methods": [
+ "Public FOIA reading room search",
+ "Async fan-out querying",
+ "Deduplication",
+ "Semantic refinement (FAISS)"
+ ],
+ "no_restricted_access": True
+ }
\ No newline at end of file
diff --git a/export_utils.py b/export_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5d4c90de0f42c129bb2db3cb1d268ce9a88718
--- /dev/null
+++ b/export_utils.py
@@ -0,0 +1,7 @@
+import json
+
+def export_json(data):
+ path = "/tmp/results.json"
+ with open(path, "w") as f:
+ json.dump(data, f, indent=2)
+ return path
\ No newline at end of file
diff --git a/faiss_vector.py b/faiss_vector.py
new file mode 100644
index 0000000000000000000000000000000000000000..e42f47f8e80e2a925bf5befd80097f4e8d07f504
--- /dev/null
+++ b/faiss_vector.py
@@ -0,0 +1,25 @@
+try:
+ import faiss
+ from sentence_transformers import SentenceTransformer
+except ImportError:
+ faiss = None
+
+class FaissIndex:
+ def __init__(self, model_name="all-MiniLM-L6-v2"):
+ if faiss is None:
+ raise RuntimeError("FAISS not installed")
+ self.model = SentenceTransformer(model_name)
+ self.index = None
+ self.docs = []
+
+ def add(self, texts):
+ emb = self.model.encode(texts)
+ if self.index is None:
+ self.index = faiss.IndexFlatL2(emb.shape[1])
+ self.index.add(emb)
+ self.docs.extend(texts)
+
+ def search(self, query, k=5):
+ emb = self.model.encode([query])
+ D, I = self.index.search(emb, k)
+ return [self.docs[i] for i in I[0] if i < len(self.docs)]
\ No newline at end of file
diff --git a/fbi.py b/fbi.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ce3560822f8e02596c6daa3c2b73976cc03f5ea
--- /dev/null
+++ b/fbi.py
@@ -0,0 +1,19 @@
+from common import fetch, clean  # common.py lives at repo root (flat layout, no package)
+from bs4 import BeautifulSoup
+
+def search_fbi(query):
+ html = fetch("https://vault.fbi.gov/search", {"SearchableText": query})
+ soup = BeautifulSoup(html, "html.parser")
+
+ results = []
+ for a in soup.select("a"):
+ href = a.get("href", "")
+ if "/vault/" in href:
+ results.append({
+ "title": clean(a.text),
+ "agency": "FBI",
+ "date": None,
+ "snippet": None,
+ "url": href if href.startswith("http") else "https://vault.fbi.gov" + href
+ })
+ return results
\ No newline at end of file
diff --git a/fbi_vault.py b/fbi_vault.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e3cfb4b41f1b74591ad361df16d4d15c9abffce
--- /dev/null
+++ b/fbi_vault.py
@@ -0,0 +1,29 @@
+import requests
+from bs4 import BeautifulSoup
+
+def ingest_fbi_vault(url: str) -> dict:
+ r = requests.get(url, timeout=10)
+ r.raise_for_status()
+ soup = BeautifulSoup(r.text, "html.parser")
+
+ text = soup.get_text(separator=" ", strip=True)
+ title = soup.find("h1")
+
+    return {
+        "source": "FBI Vault",
+        "agency": "FBI",
+        "url": url,
+        "title": title.text if title else "FBI Vault Document",
+        "text": text[:10000]
+    }
+    # NOTE(review): an accidental duplicate of the parse/return logic used to
+    # follow here. Worse, the closing brace above was written as
+    # "}(r.text, \"html.parser\")", which *called* the returned dict and
+    # raised TypeError at runtime. The duplicate was unreachable anyway;
+    # it is preserved below as comments for reference only.
+    # title = soup.find("h1")
+    # body = soup.get_text(separator=" ", strip=True)
+    # return {
+    #     "source": "FBI Vault", "url": url, "agency": "FBI",
+    #     "title": title.text if title else "Untitled FBI Vault Document",
+    #     "text": body}
\ No newline at end of file
diff --git a/fbi_vault_live.py b/fbi_vault_live.py
new file mode 100644
index 0000000000000000000000000000000000000000..c472370a33d0deb505c02b952ec58db70b189c73
--- /dev/null
+++ b/fbi_vault_live.py
@@ -0,0 +1,20 @@
+import httpx
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class FBIAdapter(GenericFOIAAdapter):
+ name = "FBI Vault"
+ rate_limit = 1
+ robots_respected = True
+ base_url = "https://vault.fbi.gov/search"
+
+ async def search(self, query: str):
+ async with httpx.AsyncClient(timeout=10) as client:
+ r = await client.get(self.base_url, params={"q": query})
+ if r.status_code != 200:
+ return []
+ return [{
+ "source": "FBI Vault",
+ "query": query,
+ "url": str(r.url),
+ "snippet": "Public FBI Vault search results"
+ }]
\ No newline at end of file
diff --git a/file_structure.txt b/file_structure.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4429d567dd52c1f83d5544e5794e1abe119aa40
--- /dev/null
+++ b/file_structure.txt
@@ -0,0 +1,20 @@
+foia-chatbot/
+├── app.py
+├── requirements.txt
+│
+├── core/
+│   ├── search.py
+│   ├── analysis.py
+│   ├── vector.py
+│   ├── index.py
+│   ├── explain.py
+│   ├── multi_program.py
+│   ├── redaction.py
+│   └── appeals.py
+│
+└── data/
+    └── demo/
+        ├── metadata.json
+        └── documents/
+            ├── tencap_sample.txt
+            └── aatip_sample.txt
\ No newline at end of file
diff --git a/foia_pdf.py b/foia_pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ee45d46cbb48d9bac9087b31aee2e73d3ceaf7
--- /dev/null
+++ b/foia_pdf.py
@@ -0,0 +1,70 @@
+# foia_pdf.py
+from reportlab.lib.pagesizes import LETTER
+from reportlab.pdfgen import canvas
+from datetime import datetime
+from typing import Dict
+import os
+import uuid
+
+OUTPUT_DIR = "generated_pdfs"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+
+def generate_foia_appeal_pdf(record: Dict) -> str:
+ """
+ Generates a FOIA appeal draft PDF.
+ This does NOT submit anything to any agency.
+ """
+
+ filename = f"foia_appeal_{uuid.uuid4().hex}.pdf"
+ path = os.path.join(OUTPUT_DIR, filename)
+
+ c = canvas.Canvas(path, pagesize=LETTER)
+ width, height = LETTER
+
+ text = c.beginText(40, height - 50)
+ text.setFont("Times-Roman", 11)
+
+ text.textLine(f"FOIA Appeal Draft")
+ text.textLine("")
+ text.textLine(f"Date: {datetime.utcnow().strftime('%Y-%m-%d')}")
+ text.textLine("")
+ text.textLine(f"Agency: {record.get('agency')}")
+ text.textLine(f"Subject: {record.get('subject')}")
+ text.textLine("")
+ text.textLine("To Whom It May Concern,")
+ text.textLine("")
+ text.textLine(
+ "This letter serves as a formal appeal regarding the handling of a "
+ "Freedom of Information Act (FOIA) request."
+ )
+ text.textLine("")
+ text.textLine(
+ "The requested materials concern publicly released or previously "
+ "acknowledged records. Disclosure would contribute significantly "
+ "to public understanding of government operations."
+ )
+ text.textLine("")
+ text.textLine("Request Description:")
+ text.textLine(record.get("description", ""))
+ text.textLine("")
+ text.textLine(
+ "This appeal is submitted in good faith for journalistic, academic, "
+ "or public-interest review."
+ )
+ text.textLine("")
+ text.textLine("Sincerely,")
+ text.textLine("FOIA Declassified Document Search")
+ text.textLine("")
+ text.textLine("โ")
+ text.textLine(
+ "Disclaimer: This document is a draft generated for reference only. "
+ "It does not constitute legal advice and does not submit a request "
+ "to any agency."
+ )
+
+ c.drawText(text)
+ c.showPage()
+ c.save()
+
+ return path
\ No newline at end of file
diff --git a/foia_requests.py b/foia_requests.py
new file mode 100644
index 0000000000000000000000000000000000000000..e09be9cd17ff753b225e4b3d9971cfd7e6550c58
--- /dev/null
+++ b/foia_requests.py
@@ -0,0 +1,52 @@
+from typing import Dict, List
+import json
+import os
+import uuid
+from datetime import datetime
+
+FOIA_STORE = "data/foia_requests.json"
+
+
+def _load_requests() -> List[Dict]:
+ if not os.path.exists(FOIA_STORE):
+ return []
+ try:
+ with open(FOIA_STORE, "r", encoding="utf-8") as f:
+ return json.load(f)
+ except Exception:
+ return []
+
+
+def _save_requests(requests: List[Dict]) -> None:
+ os.makedirs(os.path.dirname(FOIA_STORE), exist_ok=True)
+ with open(FOIA_STORE, "w", encoding="utf-8") as f:
+ json.dump(requests, f, indent=2)
+
+
+def add_foia_request(
+ agency: str,
+ subject: str,
+ description: str,
+ requester_type: str = "Journalist"
+) -> Dict:
+ """
+ Store a FOIA request record (tracking only).
+ No submission to agencies is performed.
+ """
+
+ record = {
+ "id": str(uuid.uuid4()),
+ "timestamp": datetime.utcnow().isoformat() + "Z",
+ "agency": agency,
+ "subject": subject,
+ "description": description,
+ "requester_type": requester_type,
+ "status": "Draft",
+ "notes": "Generated by FOIA Declassified Document Search (tracking only)"
+ }
+
+ requests = _load_requests()
+ requests.append(record)
+ _save_requests(requests)
+
+ return record
\ No newline at end of file
diff --git a/foia_sources.json b/foia_sources.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c3b0fae7a570a95c7fb38de379de02ec99731ff
--- /dev/null
+++ b/foia_sources.json
@@ -0,0 +1,33 @@
+[
+ {
+ "agency": "CIA",
+ "name": "CIA FOIA Reading Room",
+ "url": "https://www.cia.gov/readingroom/",
+ "license": "Public Domain",
+ "notes": "Previously released, unclassified"
+ },
+ {
+ "agency": "FBI",
+ "name": "FBI Vault",
+ "url": "https://vault.fbi.gov/",
+ "license": "Public Domain"
+ },
+ {
+ "agency": "DoD",
+ "name": "DoD FOIA Library",
+ "url": "https://open.defense.gov/Transparency/FOIA/",
+ "license": "Public Domain"
+ },
+ {
+ "agency": "NSA",
+ "name": "NSA FOIA Electronic Reading Room",
+ "url": "https://www.nsa.gov/resources/everyone/foia/",
+ "license": "Public Domain"
+ },
+ {
+ "agency": "NARA",
+ "name": "National Archives FOIA",
+ "url": "https://www.archives.gov/foia",
+ "license": "Public Domain"
+ }
+]
\ No newline at end of file
diff --git a/generic_public_foia.py b/generic_public_foia.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6fdcad03a474f1e409d58ffaeb0c01bc92a288b
--- /dev/null
+++ b/generic_public_foia.py
@@ -0,0 +1,46 @@
+# ingest/generic_public_foia.py
+
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from typing import Dict
+from .agency_registry import ALLOWED_FOIA_SOURCES
+
+MAX_CHARS = 12000
+
+def infer_agency_from_url(url: str) -> str:
+ host = urlparse(url).netloc.lower()
+
+ for domain, agency in ALLOWED_FOIA_SOURCES.items():
+ if domain.startswith("label:"):
+ continue
+ if domain in host:
+ return agency
+
+ return "Unknown"
+
+def ingest_public_foia_url(url: str) -> Dict:
+ """
+ SAFE ingestion:
+ - user-supplied URL only
+ - public FOIA / reading room pages
+ - no crawling, no discovery
+ """
+
+ agency = infer_agency_from_url(url)
+
+ r = requests.get(url, timeout=15)
+ r.raise_for_status()
+
+ soup = BeautifulSoup(r.text, "html.parser")
+
+ title = soup.find("h1")
+ text = soup.get_text(separator=" ", strip=True)
+
+ return {
+ "agency": agency,
+ "url": url,
+ "title": title.text.strip() if title else "Public FOIA Document",
+ "text": text[:MAX_CHARS],
+ "source_type": "public_foia"
+ }
\ No newline at end of file
diff --git a/gitattributes.txt b/gitattributes.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/gitattributes.txt
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/health.py b/health.py
new file mode 100644
index 0000000000000000000000000000000000000000..262e5783877a33ceff8774de30e2d6f58809fb24
--- /dev/null
+++ b/health.py
@@ -0,0 +1,12 @@
+class HealthRegistry:
+ def __init__(self):
+ self.state = {}
+
+ def update(self, agency, healthy=True):
+ self.state[agency] = healthy
+
+ def is_enabled(self, agency):
+ return self.state.get(agency, True)
+
+ def badge(self, agency):
+        return "🟢 Healthy" if self.is_enabled(agency) else "🔴 Disabled"
\ No newline at end of file
diff --git a/ice.py b/ice.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc93fe20ef9ad6a52e887576cac9bb799f16cf33
--- /dev/null
+++ b/ice.py
@@ -0,0 +1,8 @@
+def search_ice(query):
+ return [{
+ "title": "ICE FOIA Library",
+ "agency": "ICE",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.ice.gov/foia/library"
+ }]
\ No newline at end of file
diff --git a/ice_reading_room.py b/ice_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cda1bb3e1e1b692d7989e0a89277d94dde57832
--- /dev/null
+++ b/ice_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class ICEAdapter(GenericFOIAAdapter):
+ name = "ICE"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/icij.py b/icij.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0ed9b5166a3f5a4f1f37412ab859f0bc737a6e1
--- /dev/null
+++ b/icij.py
@@ -0,0 +1,17 @@
+from typing import List, Dict
+import json
+import uuid
+
+def export_icij_bundle(results: List[Dict]) -> str:
+ bundle = {
+ "bundle_id": str(uuid.uuid4()),
+ "documents": results,
+ "schema": "ICIJ Investigative Dataset v1",
+ "notes": "For collaborative investigative journalism"
+ }
+
+ path = f"/tmp/icij_bundle_{bundle['bundle_id']}.json"
+ with open(path, "w") as f:
+ json.dump(bundle, f, indent=2)
+
+ return path
\ No newline at end of file
diff --git a/index.py b/index.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe82b391d40f27b771d064e9fedb5ae63915060e
--- /dev/null
+++ b/index.py
@@ -0,0 +1,17 @@
+import os, json, numpy as np
+from core.vector import embed
+
+def load_docs(base="data/demo"):
+ meta = json.load(open(os.path.join(base, "metadata.json")))
+ docs = []
+ for fname, m in meta.items():
+ text = open(os.path.join(base, "documents", fname), encoding="utf-8").read()
+ docs.append({
+ "id": fname,
+ "text": text,
+ "vec": embed(text),
+ "agency": m["agency"],
+ "year": m["year"],
+ "program": m.get("program", "Unknown")
+ })
+ return docs
\ No newline at end of file
diff --git a/journalist.py b/journalist.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ee48ffd0ac298d3dac5e4523f132df137693e8
--- /dev/null
+++ b/journalist.py
@@ -0,0 +1,17 @@
+import zipfile, json, os, time
+
+def journalist_export(results, out_path):
+ index = []
+ with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED) as z:
+ for i,r in enumerate(results):
+ meta = {
+ "source": r.get("source"),
+ "url": r.get("url"),
+ "snippet": r.get("snippet"),
+ "timestamp": time.time()
+ }
+ name = f"doc_{i}.json"
+ z.writestr(name, json.dumps(meta, indent=2))
+ index.append(meta)
+ z.writestr("INDEX.json", json.dumps(index, indent=2))
+ return out_path
\ No newline at end of file
diff --git a/loader.py b/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff5bc0b6ec58835f8866687e3565ce60c8eb4313
--- /dev/null
+++ b/loader.py
@@ -0,0 +1,24 @@
+import requests
+from bs4 import BeautifulSoup
+from typing import List, Dict
+
+def ingest_documents(enable_scraping: bool = False) -> List[Dict]:
+ if not enable_scraping:
+ return []
+
+ # HF-safe: capped, read-only metadata fetch
+ docs = []
+ try:
+ r = requests.get("https://vault.fbi.gov", timeout=10)
+ soup = BeautifulSoup(r.text, "html.parser")
+ for link in soup.select("a")[:10]:
+ docs.append({
+ "title": link.text.strip(),
+ "agency": "FBI",
+ "date": "",
+ "content": link.get("href", "")
+ })
+ except Exception:
+ pass
+
+ return docs
\ No newline at end of file
diff --git a/marine_corps_foia_reading_room.py b/marine_corps_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4438fa120cf10c102494cd2b10bdef730bbaf79
--- /dev/null
+++ b/marine_corps_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class USMarineCorpsAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "USMarineCorps"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/metadata.json b/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbc4fcac055284aa9bda23e85422f5c33e689025
--- /dev/null
+++ b/metadata.json
@@ -0,0 +1,12 @@
+{
+ "tencap_sample.txt": {
+ "agency": "DIA",
+ "year": 1985,
+ "program": "TENCAP"
+ },
+ "aatip_sample.txt": {
+ "agency": "DIA",
+ "year": 2009,
+ "program": "AATIP"
+ }
+}
\ No newline at end of file
diff --git a/multi_program.py b/multi_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..694820b2e5b1d40bc67c8017e8a0eced89d56331
--- /dev/null
+++ b/multi_program.py
@@ -0,0 +1,16 @@
+from typing import Dict, List
+
+def compare_programs(docs: List[dict]) -> Dict[str, Dict[str, int]]:
+ matrix: Dict[str, Dict[str, int]] = {}
+
+ for d in docs:
+ agency = d.get("agency", "Unknown")
+ year = d.get("date", "")[:4]
+
+ if not year.isdigit():
+ continue
+
+ matrix.setdefault(agency, {})
+ matrix[agency][year] = matrix[agency].get(year, 0) + 1
+
+ return matrix
\ No newline at end of file
diff --git a/navy_foia_reading_room.py b/navy_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f652c3733698422464ac1d7fc2e654fa00e8ad0
--- /dev/null
+++ b/navy_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class USNavyAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "USNavy"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/nia.py b/nia.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8525d907f0277632f9c448b80356c20d9c0320b
--- /dev/null
+++ b/nia.py
@@ -0,0 +1,8 @@
+def search_nia(query):
+ return [{
+ "title": "NIA Public Disclosures (India)",
+ "agency": "NIA (India)",
+ "date": None,
+ "snippet": query,
+ "url": "https://www.nia.gov.in"
+ }]
\ No newline at end of file
diff --git a/nia_reading_room.py b/nia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..d852dc81da0802bd1a8e698a1075802a80fafa66
--- /dev/null
+++ b/nia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class NIAAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "NIA"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/nis_reading_room.py b/nis_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7dcdbbe81667e4db478ea2f7a8af7f87ddea54e
--- /dev/null
+++ b/nis_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class NISAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "NIS"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/nro_reading_room.py b/nro_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..70621d91a0420b07bf10e92d887c2b2d9d7be8a0
--- /dev/null
+++ b/nro_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class NROAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "NRO"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/nsa.py b/nsa.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6ef3e575c21519ed658001f19f90e6c97e5779b
--- /dev/null
+++ b/nsa.py
@@ -0,0 +1,8 @@
+def search_nsa(query):
+ return [{
+ "title": f"NSA Declassified Documents",
+ "agency": "NSA",
+ "date": None,
+ "snippet": "NSA does not support keyword search. Browse collections.",
+ "url": "https://www.nsa.gov/news-features/declassified-documents/"
+ }]
\ No newline at end of file
diff --git a/nsa_reading_room.py b/nsa_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f8ffeb725fda8e405a071f095b8ad2e70c2880
--- /dev/null
+++ b/nsa_reading_room.py
@@ -0,0 +1,6 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class NSAAdapter(GenericFOIAAdapter):
+ name = "NSA"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/pdf_appeal.py b/pdf_appeal.py
new file mode 100644
index 0000000000000000000000000000000000000000..6984df86d1a575eceb0830aaef0ca61d514617eb
--- /dev/null
+++ b/pdf_appeal.py
@@ -0,0 +1,52 @@
+# appeals/pdf_appeal.py
+
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib.pagesizes import LETTER
+from datetime import date
+
+def generate_appeal_text(
+ agency: str,
+ subject: str,
+ rationale: str
+) -> str:
+ return f"""
+FOIA APPEAL
+
+Agency: {agency}
+Date: {date.today().isoformat()}
+
+This appeal concerns a Freedom of Information Act request regarding:
+
+"{subject}"
+
+Grounds for Appeal:
+{rationale}
+
+The requester respectfully seeks reconsideration under FOIA,
+noting that similar records have previously been released and that
+this appeal makes no claim regarding classified or undisclosed activity.
+
+Sincerely,
+Requester
+""".strip()
+
+def export_appeal_pdf(
+ agency: str,
+ subject: str,
+ rationale: str,
+ out_path: str = "/tmp/foia_appeal.pdf"
+) -> str:
+
+ styles = getSampleStyleSheet()
+ doc = SimpleDocTemplate(out_path, pagesize=LETTER)
+
+ story = []
+ text = generate_appeal_text(agency, subject, rationale)
+
+ for para in text.split("\n\n"):
+        # ReportLab Paragraph understands <br/> as an intra-paragraph line break.
+        story.append(Paragraph(para.replace("\n", "<br/>"), styles["BodyText"]))
+ story.append(Spacer(1, 12))
+
+ doc.build(story)
+ return out_path
\ No newline at end of file
diff --git a/preview.py b/preview.py
new file mode 100644
index 0000000000000000000000000000000000000000..15c391e7d0bbf5d8e67311886f804c96a041729f
--- /dev/null
+++ b/preview.py
@@ -0,0 +1,7 @@
+def safe_preview(url: str):
+ if not url:
+ return "No preview available"
+ blocked = ["pdf", "download"]
+ if any(b in url.lower() for b in blocked):
+ return "Preview disabled (redaction-protected document)"
+ return f""
\ No newline at end of file
diff --git a/redaction.py b/redaction.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe1d89b31f2df250ed96fb4ad149bb93d445611b
--- /dev/null
+++ b/redaction.py
@@ -0,0 +1,7 @@
+def redaction_confidence(result: dict) -> float:
+ url = result.get("url","").lower()
+ score = 0.9
+ if "pdf" in url: score -= 0.3
+ if "redact" in url: score -= 0.4
+ if "download" in url: score -= 0.2
+ return max(0.0, score)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dd594e8ea87fbcdca490a6c1300c731b87c27694
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+beautifulsoup4
+datasets
+faiss-cpu
+gradio>=4.44.0
+httpx
+networkx
+numpy
+pandas
+reportlab
+requests
+sentence-transformers
\ No newline at end of file
diff --git a/sap_public_releases.py b/sap_public_releases.py
new file mode 100644
index 0000000000000000000000000000000000000000..45edac787d2c0a0c751c677c0732d32d0b35f758
--- /dev/null
+++ b/sap_public_releases.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
+class SpecialAccessProgramsAdapter(GenericFOIAAdapter):
+ """Public-release FOIA reading room adapter.
+ NOTE: This adapter is restricted to publicly released materials only.
+ """
+ name = "SpecialAccessPrograms"
+ rate_limit = 1 # requests per second
+ robots_respected = True
\ No newline at end of file
diff --git a/saved.py b/saved.py
new file mode 100644
index 0000000000000000000000000000000000000000..08ac8b139bad5220a838268b0c5c5933c4ae5b64
--- /dev/null
+++ b/saved.py
@@ -0,0 +1,12 @@
+import json, hashlib, time
+
+_STORE = {}
+
+def save_search(query, sources):
+ payload = {"q": query, "s": sources, "t": int(time.time())}
+ key = hashlib.sha256(json.dumps(payload).encode()).hexdigest()[:12]
+ _STORE[key] = payload
+ return key
+
+def load_search(key):
+ return _STORE.get(key)
\ No newline at end of file
diff --git a/schemas.py b/schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..255d81468f5829c9d193aaf38f7fd3cc7566bf82
--- /dev/null
+++ b/schemas.py
@@ -0,0 +1,8 @@
+from typing import List, Dict, Any
+
+def validate_results(results: List[Dict[str, Any]]) -> None:
+ for r in results:
+ assert isinstance(r["document"], str)
+ assert isinstance(r["agency"], str)
+ assert isinstance(r["year"], int)
+ assert isinstance(r["excerpt"], str)
\ No newline at end of file
diff --git a/search.py b/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..e968eb054d0d666749ce7edec2cc87249af77efa
--- /dev/null
+++ b/search.py
@@ -0,0 +1,39 @@
+from typing import List, TypedDict
+import json
+import os
+
+DATA_PATH = "data/index/demo_index.json"
+
+class SearchResult(TypedDict):
+ document: str
+ agency: str
+ date: str
+ excerpt: str
+ citation: str
+ score: float
+
+
+def search_docs(query: str) -> List[SearchResult]:
+ query_l = query.lower()
+ results: List[SearchResult] = []
+
+ if not os.path.exists(DATA_PATH):
+ return results
+
+ with open(DATA_PATH, "r") as f:
+ docs = json.load(f)
+
+ for d in docs:
+ text = d["excerpt"].lower()
+ if query_l in text:
+ score = text.count(query_l) / max(len(text), 1)
+ results.append({
+ "document": d["document"],
+ "agency": d["agency"],
+ "date": d["date"],
+ "excerpt": d["excerpt"],
+ "citation": d["citation"],
+ "score": round(score, 4)
+ })
+
+ return sorted(results, key=lambda x: x["score"], reverse=True)
\ No newline at end of file
diff --git a/secret_service_reading_room.py b/secret_service_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..83302c919cb5a1688ad7b5cad9f24c8651482a2f
--- /dev/null
+++ b/secret_service_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
class SecretServiceAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    # Source label identifying this adapter — presumably consumed by the
    # ingest framework; base class GenericFOIAAdapter is not visible here.
    name = "SecretService"
    rate_limit = 1  # requests per second
    # NOTE(review): presumably read by the base crawler to honor robots.txt
    # — confirm against GenericFOIAAdapter.
    robots_respected = True
\ No newline at end of file
diff --git a/semantic.py b/semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e454efbe82135341f0eaefbb0667b1192f7130
--- /dev/null
+++ b/semantic.py
@@ -0,0 +1,18 @@
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from typing import List, Dict
+
# Shared sentence-embedding model, loaded once at import time (the first
# run downloads the weights; later calls reuse this instance).
model = SentenceTransformer("all-MiniLM-L6-v2")
+
def build_faiss_index(docs: List[Dict]):
    """Embed each doc's "content" field and pack the vectors into a flat L2 index.

    Returns the (index, embeddings) pair so callers can reuse the raw vectors.
    """
    corpus = [doc["content"] for doc in docs]
    vectors = model.encode(corpus)
    dim = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dim)
    flat_index.add(np.array(vectors))
    return flat_index, vectors
+
def semantic_search(query: str, docs: List[Dict], index, k: int = 5):
    """Return up to *k* docs most semantically similar to *query*.

    Args:
        query: free-text query to embed.
        docs: the document list the index was built from (positional ids).
        index: a faiss index built over embeddings of ``docs``.
        k: number of neighbours to request (was hard-coded to 5; the
            default preserves existing behaviour).

    Bug fix: when ``k`` exceeds the number of indexed vectors, faiss pads
    the id matrix with ``-1``; those ids used to flow into ``docs[i]`` and
    silently return ``docs[-1]``.  Negative ids are now skipped.
    """
    q_emb = model.encode([query])
    _, ids = index.search(np.array(q_emb), k=k)
    return [docs[i] for i in ids[0] if i >= 0]
\ No newline at end of file
diff --git a/semantic_refine.py b/semantic_refine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5898ce47ea8470ea007b175ccc3fb6730759bd3
--- /dev/null
+++ b/semantic_refine.py
@@ -0,0 +1,14 @@
+from typing import List
+from core.faiss_vector import FaissIndex
+
class SemanticRefiner:
    """Second-pass ranking: index result snippets, then query them semantically."""

    def __init__(self):
        self.index = FaissIndex()

    def build_from_results(self, results: List[dict]):
        """Add every non-empty "snippet" field from *results* to the index."""
        snippets = [entry.get("snippet", "") for entry in results if entry.get("snippet")]
        if snippets:
            self.index.add(snippets)

    def refine(self, query: str, k: int = 10):
        """Return the top-*k* indexed snippets for *query*."""
        return self.index.search(query, k)
\ No newline at end of file
diff --git a/sources.py b/sources.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e69d858f49f644e78ac98a6063d4799ac489917
--- /dev/null
+++ b/sources.py
@@ -0,0 +1,9 @@
# Public FOIA reading-room landing pages, keyed by agency label.
# Only publicly released records are reachable through these entry points.
AGENCY_SOURCES = {
    "FBI": "https://vault.fbi.gov",
    "CIA": "https://www.cia.gov/readingroom",
    "DoD": "https://www.esd.whs.mil/FOIA/",
    "NSA": "https://www.nsa.gov/Helpful-Links/FOIA/",
    "NRO": "https://www.nro.gov/FOIA/",
    "USAF": "https://www.afhra.af.mil/FOIA/",
    "White House": "https://www.archives.gov/foia"
}
\ No newline at end of file
diff --git a/space.yaml b/space.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a1254c5716e84a6eb98933cf31a3d475bdac9343
--- /dev/null
+++ b/space.yaml
@@ -0,0 +1,8 @@
+title: FOIA Federated Search
+emoji: 🔍
+colorFrom: blue
+colorTo: gray
+sdk: gradio
+python_version: "3.10"
+app_file: app.py
+pinned: false
\ No newline at end of file
diff --git a/space_force_foia_reading_room.py b/space_force_foia_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..25bf57b1828e2a872ccdc4431069d191c518219e
--- /dev/null
+++ b/space_force_foia_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
class USSpaceForceAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    # Source label identifying this adapter — presumably consumed by the
    # ingest framework; base class GenericFOIAAdapter is not visible here.
    name = "USSpaceForce"
    rate_limit = 1  # requests per second
    # NOTE(review): presumably read by the base crawler to honor robots.txt
    # — confirm against GenericFOIAAdapter.
    robots_respected = True
\ No newline at end of file
diff --git a/special_activities_public.py b/special_activities_public.py
new file mode 100644
index 0000000000000000000000000000000000000000..7920f959a34abadc85d7b4b14dbd16d0316d93de
--- /dev/null
+++ b/special_activities_public.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
class SpecialActivitiesAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    # Source label identifying this adapter — presumably consumed by the
    # ingest framework; base class GenericFOIAAdapter is not visible here.
    name = "SpecialActivities"
    rate_limit = 1  # requests per second
    # NOTE(review): presumably read by the base crawler to honor robots.txt
    # — confirm against GenericFOIAAdapter.
    robots_respected = True
\ No newline at end of file
diff --git a/special_projects_public.py b/special_projects_public.py
new file mode 100644
index 0000000000000000000000000000000000000000..5613ead8d7744198d902b35d20f7b5f72a6a0be2
--- /dev/null
+++ b/special_projects_public.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
class SpecialProjectsAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    # Source label identifying this adapter — presumably consumed by the
    # ingest framework; base class GenericFOIAAdapter is not visible here.
    name = "SpecialProjects"
    rate_limit = 1  # requests per second
    # NOTE(review): presumably read by the base crawler to honor robots.txt
    # — confirm against GenericFOIAAdapter.
    robots_respected = True
\ No newline at end of file
diff --git a/tencap_reading_room.py b/tencap_reading_room.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d23556d54fbf93a2d8d6c0e59e6246ffcb80077
--- /dev/null
+++ b/tencap_reading_room.py
@@ -0,0 +1,9 @@
+from ingest.generic_public_foia import GenericFOIAAdapter
+
class TENCAPAdapter(GenericFOIAAdapter):
    """Public-release FOIA reading room adapter.

    NOTE: This adapter is restricted to publicly released materials only.
    """
    # Source label identifying this adapter — presumably consumed by the
    # ingest framework; base class GenericFOIAAdapter is not visible here.
    name = "TENCAP"
    rate_limit = 1  # requests per second
    # NOTE(review): presumably read by the base crawler to honor robots.txt
    # — confirm against GenericFOIAAdapter.
    robots_respected = True
\ No newline at end of file
diff --git a/tencap_sample.txt b/tencap_sample.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9990a4d5d03722691e29aa455fea86344ad887a9
--- /dev/null
+++ b/tencap_sample.txt
@@ -0,0 +1 @@
+████ TENCAP satellite exploitation program referenced by DIA in 1985.
\ No newline at end of file
diff --git a/test_core.py b/test_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b9cd8deb66d496354384c771d32aff915ac24b0
--- /dev/null
+++ b/test_core.py
@@ -0,0 +1,5 @@
+from core.search import search_docs
+
def test_search_returns_list():
    """search_docs must always yield a list, even with no index on disk."""
    outcome = search_docs("test")
    assert isinstance(outcome, list)
\ No newline at end of file
diff --git a/test_schema.py b/test_schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..8871678375df361b8817eecb6c014f7b9d1e4994
--- /dev/null
+++ b/test_schema.py
@@ -0,0 +1,11 @@
+from schemas import validate_results
+
def test_schema():
    """A well-formed result record passes validation without raising."""
    record = {
        "document": "Test",
        "agency": "CIA",
        "year": 1999,
        "excerpt": "Sample text",
    }
    validate_results([record])
\ No newline at end of file
diff --git a/throttle.py b/throttle.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e253dcaeec2278a4f3d214c1ec02b7e05fc34ab
--- /dev/null
+++ b/throttle.py
@@ -0,0 +1,16 @@
import asyncio
import time
+
class AgencyThrottle:
    """Async rate limiter enforcing a minimum interval between calls.

    Bug fix: ``wait()`` called ``asyncio.sleep`` but the module never
    imported ``asyncio``, so the first throttled call raised NameError
    (fixed in the import block).
    """

    def __init__(self, min_interval=1.0):
        self.min_interval = min_interval  # seconds between consecutive calls
        self.last_call = 0                # timestamp of the last completed wait()
        self.healthy = True               # cleared by mark_unhealthy()

    async def wait(self):
        """Sleep just long enough so successive calls are >= min_interval apart.

        Uses time.monotonic(): unlike time.time(), it cannot jump when the
        wall clock is adjusted, which would otherwise let a clock step
        bypass (or hugely inflate) the throttle.
        """
        delta = time.monotonic() - self.last_call
        if delta < self.min_interval:
            await asyncio.sleep(self.min_interval - delta)
        self.last_call = time.monotonic()

    def mark_unhealthy(self):
        """Record that this agency endpoint is failing; callers may skip it."""
        self.healthy = False
\ No newline at end of file
diff --git a/trust_safety.md b/trust_safety.md
new file mode 100644
index 0000000000000000000000000000000000000000..7517946f738b79725a63a17b4c40e33e2d71b9c9
--- /dev/null
+++ b/trust_safety.md
@@ -0,0 +1,12 @@
+# Trust & Safety Statement
+
+This Space indexes only publicly released FOIA documents.
+No private data is collected or stored.
+
+Safeguards:
+- Rate limiting
+- Redaction pipeline
+- Audit logging
+- Explicit prohibition on misuse
+
+This tool exists solely to enhance public access to already-released government records.
\ No newline at end of file
diff --git a/vector.py b/vector.py
new file mode 100644
index 0000000000000000000000000000000000000000..8934964190ac78cb3a31658a0084dfaddca50d9e
--- /dev/null
+++ b/vector.py
@@ -0,0 +1,11 @@
import numpy as np
import re
import zlib

# Dimensionality of the hashed bag-of-words embedding space.
DIM = 512

# Words are runs of >= 2 ASCII letters; single letters and digits are ignored.
# Compiled once instead of re-scanning the pattern on every call.
_WORD_RE = re.compile(r"[a-zA-Z]{2,}")


def embed(text: str) -> np.ndarray:
    """Hash *text* into a unit-normalised bag-of-words vector of size DIM.

    Bug fix: the original used the builtin ``hash()`` on strings, which is
    salted per process (PYTHONHASHSEED), so word buckets — and therefore any
    persisted vector index — were not reproducible across runs.  zlib.crc32
    is a stable, documented checksum, making embeddings deterministic.

    Returns:
        A float32 vector of length DIM with unit L2 norm, or the zero
        vector when *text* contains no matching words.
    """
    v = np.zeros(DIM, dtype=np.float32)
    for w in _WORD_RE.findall(text.lower()):
        v[zlib.crc32(w.encode("utf-8")) % DIM] += 1.0
    n = np.linalg.norm(v)
    return v / n if n > 0 else v
\ No newline at end of file