Spaces:
Sleeping
Sleeping
Upload 98 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- AGENCY_COVERAGE.md +11 -0
- CODE_OF_CONDUCT.md +28 -0
- Dockerfile.hf +5 -0
- ETHICS.md +24 -0
- FILE_INVENTORY.txt +60 -0
- HASH_MANIFEST.json +62 -0
- HF_JUSTIFICATION.md +12 -0
- HF_SPACE_README.md +32 -0
- LEGAL_MEMO.md +6 -0
- README.md +35 -0
- README_PROD.md +40 -0
- SOURCES.md +23 -0
- __init__.py +0 -0
- aatip_reading_room.py +9 -0
- aatip_sample.txt +1 -0
- agency_registry.py +39 -0
- air_force_foia_reading_room.py +9 -0
- analysis.py +12 -0
- analytics.py +14 -0
- app.py +51 -0
- appeal_pdf.py +8 -0
- appeals.py +16 -0
- army_foia_reading_room.py +9 -0
- async_search.py +10 -0
- audit.py +18 -0
- cache.py +53 -0
- cia.py +21 -0
- cia_reading_room.py +21 -0
- citations.py +7 -0
- cluster.py +12 -0
- coast_guard_foia_reading_room.py +9 -0
- collaboration.py +17 -0
- common.py +14 -0
- darpa_reading_room.py +9 -0
- dea.py +8 -0
- dea_reading_room.py +6 -0
- dhs.py +8 -0
- dhs_reading_room.py +6 -0
- dia.py +8 -0
- dia_reading_room.py +6 -0
- dod.py +8 -0
- dod_reading_room.py +6 -0
- dod_reading_room_live.py +20 -0
- doj.py +8 -0
- entity_graph.py +19 -0
- explain.py +12 -0
- export_utils.py +7 -0
- faiss_vector.py +25 -0
- fbi.py +19 -0
- fbi_vault.py +29 -0
AGENCY_COVERAGE.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Agency Coverage Map
|
| 3 |
+
|
| 4 |
+
| Agency | Public FOIA Reading Room |
|
| 5 |
+
|------|---------------------------|
|
| 6 |
+
| CIA | https://www.cia.gov/readingroom/ |
|
| 7 |
+
| FBI | https://vault.fbi.gov/ |
|
| 8 |
+
| DoD | https://www.foia.mil/ |
|
| 9 |
+
| NSA | https://www.nsa.gov/readingroom/ |
|
| 10 |
+
| NRO | https://www.nro.gov/FOIA/ |
|
| 11 |
+
| DHS | https://www.dhs.gov/foia-reading-room |
|
CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code of Conduct
|
| 2 |
+
|
| 3 |
+
## Our Pledge
|
| 4 |
+
|
| 5 |
+
This project is committed to providing a respectful, inclusive, and responsible environment for all contributors and users.
|
| 6 |
+
|
| 7 |
+
## Acceptable Use
|
| 8 |
+
|
| 9 |
+
Participants agree to:
|
| 10 |
+
- Use this project for lawful, ethical, and non-harmful purposes
|
| 11 |
+
- Respect the public-record nature of FOIA documents
|
| 12 |
+
- Avoid speculative, defamatory, or misleading interpretations
|
| 13 |
+
|
| 14 |
+
## Unacceptable Use
|
| 15 |
+
|
| 16 |
+
This project must not be used to:
|
| 17 |
+
- Harass or target individuals
|
| 18 |
+
- Make unsubstantiated allegations
|
| 19 |
+
- Claim access to classified or restricted information
|
| 20 |
+
- Bypass legal or ethical safeguards
|
| 21 |
+
|
| 22 |
+
## Enforcement
|
| 23 |
+
|
| 24 |
+
Maintainers may remove content or restrict access that violates this Code of Conduct.
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
This project is intended for civic transparency, education, and research.
|
Dockerfile.hf
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
COPY . /app
|
| 4 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 5 |
+
CMD ["python", "app.py"]
|
ETHICS.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ethics Policy
|
| 2 |
+
|
| 3 |
+
## Purpose
|
| 4 |
+
|
| 5 |
+
This project exists to support transparency, research, and public understanding of government records released under the Freedom of Information Act (FOIA).
|
| 6 |
+
|
| 7 |
+
## Guiding Principles
|
| 8 |
+
|
| 9 |
+
- **Public Sources Only:** All data must originate from publicly released documents.
|
| 10 |
+
- **No Speculation:** The project does not infer, predict, or hypothesize beyond document text.
|
| 11 |
+
- **Citation First:** Outputs must be traceable to source material.
|
| 12 |
+
- **No Harm:** The tool must not be used to defame, harass, or mislead.
|
| 13 |
+
|
| 14 |
+
## Redactions
|
| 15 |
+
|
| 16 |
+
Redacted content is respected. This project does not attempt to reconstruct or infer withheld information.
|
| 17 |
+
|
| 18 |
+
## Accountability
|
| 19 |
+
|
| 20 |
+
Users are responsible for how they interpret and use results. This tool provides analytical assistance, not conclusions.
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
Ethical transparency is foundational to this project.
|
FILE_INVENTORY.txt
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CODE_OF_CONDUCT.md
|
| 2 |
+
ETHICS.md
|
| 3 |
+
README.md
|
| 4 |
+
README_PROD.md
|
| 5 |
+
__init__.py
|
| 6 |
+
adapters/__init__.py
|
| 7 |
+
adapters/cia.py
|
| 8 |
+
adapters/common.py
|
| 9 |
+
adapters/dea.py
|
| 10 |
+
adapters/dhs.py
|
| 11 |
+
adapters/dia.py
|
| 12 |
+
adapters/dod.py
|
| 13 |
+
adapters/doj.py
|
| 14 |
+
adapters/fbi.py
|
| 15 |
+
adapters/ice.py
|
| 16 |
+
adapters/nia.py
|
| 17 |
+
adapters/nsa.py
|
| 18 |
+
app.py
|
| 19 |
+
appeal_pdf.py
|
| 20 |
+
appeals/__init__.py
|
| 21 |
+
appeals/pdf_appeal.py
|
| 22 |
+
audit.py
|
| 23 |
+
collaboration.py
|
| 24 |
+
collaboration/__init__.py
|
| 25 |
+
collaboration/icij.py
|
| 26 |
+
core/__init__.py
|
| 27 |
+
core/analysis.py
|
| 28 |
+
core/appeals.py
|
| 29 |
+
core/explain.py
|
| 30 |
+
core/index.py
|
| 31 |
+
core/multi_program.py
|
| 32 |
+
core/redaction.py
|
| 33 |
+
core/search.py
|
| 34 |
+
core/vector.py
|
| 35 |
+
data/demo/documents/aatip_sample.txt
|
| 36 |
+
data/demo/documents/tencap_sample.txt
|
| 37 |
+
data/demo/metadata.json
|
| 38 |
+
data/foia_sources.json
|
| 39 |
+
entity_graph.py
|
| 40 |
+
export_utils.py
|
| 41 |
+
file_structure.txt
|
| 42 |
+
foia_pdf.py
|
| 43 |
+
foia_requests.py
|
| 44 |
+
gitattributes.txt
|
| 45 |
+
ingest/__init__.py
|
| 46 |
+
ingest/agency_registry.py
|
| 47 |
+
ingest/cia_reading_room.py
|
| 48 |
+
ingest/fbi_vault.py
|
| 49 |
+
ingest/generic_public_foia.py
|
| 50 |
+
ingest/loader.py
|
| 51 |
+
ingest/sources.py
|
| 52 |
+
requirements.txt
|
| 53 |
+
schemas.py
|
| 54 |
+
search/__init__.py
|
| 55 |
+
search/semantic.py
|
| 56 |
+
semantic.py
|
| 57 |
+
tests/__init__.py
|
| 58 |
+
tests/test_core.py
|
| 59 |
+
tests/test_schema.py
|
| 60 |
+
vector_store.py
|
HASH_MANIFEST.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"CODE_OF_CONDUCT.md": "b674f96cae26f0050be863c4b8782510fcae5ab855f0822ec4a0217763a84601",
|
| 3 |
+
"ETHICS.md": "d4f7c23c1e60297712786e392800158fcbe21116576496632e8221b0b8a16ff2",
|
| 4 |
+
"README.md": "e9bfdd2d6a4422fcb132bd4033a69d2241574c31fab71820e4643491b3b1225b",
|
| 5 |
+
"app.py": "c2a2b16ce45a327de0d42196104cb7fc50ec29ff1cb1fb95517a8ca655a3192a",
|
| 6 |
+
"appeal_pdf.py": "2d28ca1d0e796bfb5da25eac05a91354aadd58deefd041acded9a01a64055f9c",
|
| 7 |
+
"audit.py": "01c286d4067c6fffcb990391d8f750719c1ccac07eafc4477ccbdd1be4dd11e8",
|
| 8 |
+
"collaboration.py": "7cbd52c0da9be9f205b2901d8a94f28cb96612ffe506bcad1c7991885cd2d947",
|
| 9 |
+
"entity_graph.py": "dbe21fa0d8e7528daeee34d598efba836ab6370ad609de80746be1b12a4e0ff5",
|
| 10 |
+
"export_utils.py": "a01a088fd650a947a7831e795508208d3caa430d099aa5a8d7823ba462f0a80e",
|
| 11 |
+
"file_structure.txt": "6eee55e586751e3ae1405349f01dd35703e678d8e105ea19fc58eb15e4c2a6fa",
|
| 12 |
+
"foia_pdf.py": "babbd69a2da67681f15596ab254174310b8381d5853da72fe068d31d746725ab",
|
| 13 |
+
"foia_requests.py": "ca9c765bb7a591c462a94b0aa42957d1b3124128266d4880f0654895ce0ca6c0",
|
| 14 |
+
"gitattributes.txt": "11ad7efa24975ee4b0c3c3a38ed18737f0658a5f75a0a96787b576a78a023361",
|
| 15 |
+
"requirements.txt": "444bc9beedfa3fde82790f47c1e9b94bab90be2fefd0648de0ffdebbcc2eb61c",
|
| 16 |
+
"schemas.py": "e08b38513be2572af7d022e013f037c4f614f2117db85d4d776c408be96815ef",
|
| 17 |
+
"semantic.py": "4ffcf9149f08b8e69473e5418588dd370bbd470b137f2d0761901fccf09238cf",
|
| 18 |
+
"vector_store.py": "c61701e38e12150c541d284e13824341dde1794d3b4149d2a7d332b8023ad923",
|
| 19 |
+
"__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 20 |
+
"README_PROD.md": "3b5d0a9f882f8f980a08452ca589a788b3c7cfe2ed8b7ca13a01f9c4a12e9060",
|
| 21 |
+
"adapters/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 22 |
+
"adapters/cia.py": "a934e6a67aa6662391814036a9084779f15ad9ca5059f5461e2c374dfa9c3344",
|
| 23 |
+
"adapters/common.py": "c76c7ea1ce1616a2c99bfeec47ca046e75088f4d807f9c94ce5f87c9eeed5714",
|
| 24 |
+
"adapters/dea.py": "f1b9832aeaabecf5da8f1125e33883ce28e37d081149f6c75bf9ef49ac3ead8a",
|
| 25 |
+
"adapters/dhs.py": "66c44ee323135ee8e3c0cb7a2bb83d2d9dc20f7b88b6db4823e1bd5d03be6227",
|
| 26 |
+
"adapters/dia.py": "5c003321750582f502bcf0e2115956edb9af3aa8937917b72d7b25036b493f6d",
|
| 27 |
+
"adapters/dod.py": "410726bcab164fa9991d0ba61b3d9586d271ee4d55f65d1bd02193e84f02ed30",
|
| 28 |
+
"adapters/doj.py": "56080addcaef0a01d2395b6d44a93e9e271bc569a688f65657617d730a054eac",
|
| 29 |
+
"adapters/fbi.py": "b81b80972adf70b8283f2c16b241d17f46ab3ab73cd3ab4155dc88f7afbbcfc2",
|
| 30 |
+
"adapters/ice.py": "f0d06239d483933ba53966bc8015b9ca9f3ead3ebb535f4f963f5a26afd340b3",
|
| 31 |
+
"adapters/nia.py": "cbc240d23d7ac144d0ca0a49e83341df579903092c13c7603cfe438e7dd58a84",
|
| 32 |
+
"adapters/nsa.py": "a5a7ff4f8d3b1397bccc6095471de814aad75e2711566065f8cf7f4f43c59303",
|
| 33 |
+
"appeals/pdf_appeal.py": "cfe7ca493bf9a4280eff3d90494b2e2afc8bfed92ee99d5e175c1daf49ddadf6",
|
| 34 |
+
"appeals/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 35 |
+
"collaboration/icij.py": "bd02217afd54664762594dfcd1e8088ac3666c641acd450d3b233cf05f08a641",
|
| 36 |
+
"collaboration/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 37 |
+
"core/analysis.py": "e745cc6ad43d5193c92b5d7c417db4546ec301a96761090a319f7a477722dd99",
|
| 38 |
+
"core/appeals.py": "9ac66f34fdb2e741b6341de258291fd99db7f4a95862e39aa4cae94448726609",
|
| 39 |
+
"core/explain.py": "accdde04f5faf85b48302917f6274a12f06b9058fac5941cfa7ce9a64a6c45a3",
|
| 40 |
+
"core/index.py": "d266fc0aacbc2445b25cafbc29530e9138bb626090fb716681f300976927903c",
|
| 41 |
+
"core/multi_program.py": "444928c79f9778ebffcdb47262ba63b2eb19d2ed4d97d5632682a92e91861138",
|
| 42 |
+
"core/redaction.py": "b99bbbcb659e1f60902bca7e2bde5b0c28f371b7a6feb9daff489bb8fd96b878",
|
| 43 |
+
"core/search.py": "5843e5ee44d88688862b73e5457ff596dd229fc9433600c2e1a978868c8a2296",
|
| 44 |
+
"core/vector.py": "518e78f8c363735f5629584d2d5e25876a7f80063cd74e72a080723380141ce8",
|
| 45 |
+
"core/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 46 |
+
"data/foia_sources.json": "8fe166a285717548afb937ad7a669020c60f91d9ec9f06dbdee9954f3396bd2a",
|
| 47 |
+
"data/demo/metadata.json": "89d069dd00b20d1c74eb6f192a09b9d11226d86d5f7754159b3b1717512302d3",
|
| 48 |
+
"data/demo/documents/aatip_sample.txt": "8b8d9a6167699a123885330093dac739025bfe0d7fabfdfd596707ab53db9f81",
|
| 49 |
+
"data/demo/documents/tencap_sample.txt": "e1930579e04e76cc2ced2b5b253fa59e907b28bbffba2f8c1710693cfc84b167",
|
| 50 |
+
"ingest/agency_registry.py": "89581ae5dcf6f0e5614939ce8538e17f4e22a1751d806bfce5cd51fbf9d35f85",
|
| 51 |
+
"ingest/cia_reading_room.py": "ebfa118842937a7929a1ce58998650f11081306e8a017d53f01e11262917f2e5",
|
| 52 |
+
"ingest/fbi_vault.py": "9a24fd572db556cc182239738ca2c551d6cb6a393a325f3fc8f6db9cbf1c157b",
|
| 53 |
+
"ingest/generic_public_foia.py": "60f174b9ada68330a70ca11898ae3fbb7d225e2f404265903a5079aaa274baa1",
|
| 54 |
+
"ingest/loader.py": "12b2b68d4c3a902270be73bebb1218314f19b225f7df4e436191f433378aca18",
|
| 55 |
+
"ingest/sources.py": "4b995bff081e14cbe3b66deb516abc74fce09e29f3e36463f60bbbcaf11b075b",
|
| 56 |
+
"ingest/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 57 |
+
"search/semantic.py": "974faa592af9a67ec50a691180ad68d90e00d38244871680c0c45f31a77f8f36",
|
| 58 |
+
"search/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
| 59 |
+
"tests/test_core.py": "58e5d87c0de8482328abcc27d7a1452cdc6a69740eb0de4395c78a250d12d79e",
|
| 60 |
+
"tests/test_schema.py": "04c0343db5c7516679395717a1dd4c2eca4e325cf038e5c6ee794c2a62649119",
|
| 61 |
+
"tests/__init__.py": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
|
| 62 |
+
}
|
HF_JUSTIFICATION.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
This Hugging Face Space provides a public-interest federated search interface
|
| 3 |
+
across U.S. Government FOIA Electronic Reading Rooms.
|
| 4 |
+
|
| 5 |
+
Safeguards:
|
| 6 |
+
- Public sources only
|
| 7 |
+
- No authentication bypass
|
| 8 |
+
- Rate limiting and health checks
|
| 9 |
+
- Redaction-aware previews
|
| 10 |
+
- Metadata indexing only
|
| 11 |
+
|
| 12 |
+
Intended for journalism, research, and accountability.
|
HF_SPACE_README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA Federated Document Search (Public Interest)
|
| 2 |
+
|
| 3 |
+
🚀 **Hugging Face Space – Transparency & Accountability Tool**
|
| 4 |
+
|
| 5 |
+
This application provides **semantic search across publicly released U.S. Government FOIA electronic reading rooms**.
|
| 6 |
+
It does **not** access classified, private, or restricted systems.
|
| 7 |
+
|
| 8 |
+
## What This Is
|
| 9 |
+
- Federated FOIA document search
|
| 10 |
+
- Semantic + keyword hybrid retrieval
|
| 11 |
+
- Redaction-aware exports
|
| 12 |
+
- Audit logging
|
| 13 |
+
|
| 14 |
+
## What This Is NOT
|
| 15 |
+
- Surveillance
|
| 16 |
+
- Intelligence gathering
|
| 17 |
+
- Law enforcement tooling
|
| 18 |
+
- Political persuasion
|
| 19 |
+
|
| 20 |
+
## Data Sources
|
| 21 |
+
- CIA FOIA Electronic Reading Room
|
| 22 |
+
- FBI Vault
|
| 23 |
+
- Other agency FOIA libraries (public releases only)
|
| 24 |
+
|
| 25 |
+
## Compliance
|
| 26 |
+
- FOIA-only sources
|
| 27 |
+
- robots.txt respected
|
| 28 |
+
- Rate-limited adapters
|
| 29 |
+
- Redaction before export
|
| 30 |
+
|
| 31 |
+
## Intended Users
|
| 32 |
+
Researchers, journalists, historians, and the general public.
|
LEGAL_MEMO.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FOIA Federated Search – Legal Summary
|
| 3 |
+
|
| 4 |
+
This system indexes publicly released FOIA records.
|
| 5 |
+
No restricted access, no scraping of protected systems.
|
| 6 |
+
Fully compliant with 5 U.S.C. § 552.
|
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FOIA Federated Search
|
| 3 |
+
emoji: 📜
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "4.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# FOIA Federated Search (Public Interest)
|
| 14 |
+
|
| 15 |
+
A Hugging Face Space that provides **live federated search** across publicly available
|
| 16 |
+
U.S. Government FOIA Electronic Reading Rooms (CIA, FBI, DoD, and more).
|
| 17 |
+
|
| 18 |
+
## Key Features
|
| 19 |
+
- Live async fan-out search (no scraping beyond public endpoints)
|
| 20 |
+
- Per-agency source toggles + result counts
|
| 21 |
+
- Semantic *search-in-results* using FAISS + sentence-transformers
|
| 22 |
+
- Local caching + deduplication
|
| 23 |
+
- PDF export of search results
|
| 24 |
+
- Inline document preview (where permitted by source)
|
| 25 |
+
- Rate-limited, health-checked agency adapters
|
| 26 |
+
|
| 27 |
+
## Trust & Safety
|
| 28 |
+
- Queries only **public FOIA reading rooms**
|
| 29 |
+
- Honors robots.txt, rate limits, and agency terms
|
| 30 |
+
- No authentication bypass or restricted content
|
| 31 |
+
- Designed for research, journalism, and public accountability
|
| 32 |
+
|
| 33 |
+
## Legal
|
| 34 |
+
All content remains hosted by the originating agency.
|
| 35 |
+
This tool indexes metadata and snippets for discovery only.
|
README_PROD.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA HF Document Search — Production Build
|
| 2 |
+
|
| 3 |
+
## Entry Point
|
| 4 |
+
- `app.py` — orchestrates ingestion, semantic search, export, and audit hooks.
|
| 5 |
+
|
| 6 |
+
## Ingestion Adapters (Present)
|
| 7 |
+
- CIA Reading Room
|
| 8 |
+
- FBI Vault
|
| 9 |
+
- Generic Public FOIA
|
| 10 |
+
|
| 11 |
+
## Missing / Stubbed Adapters (Recommended)
|
| 12 |
+
- DoD (incl. components)
|
| 13 |
+
- NSA
|
| 14 |
+
- DIA
|
| 15 |
+
- DHS
|
| 16 |
+
- DEA
|
| 17 |
+
- ICE
|
| 18 |
+
|
| 19 |
+
## Vector Backend Assumptions
|
| 20 |
+
- Current code supports abstract vector ops.
|
| 21 |
+
- Recommended backends:
|
| 22 |
+
- FAISS (local)
|
| 23 |
+
- Chroma (persistent)
|
| 24 |
+
- HuggingFace embeddings
|
| 25 |
+
- OpenAI embeddings (optional)
|
| 26 |
+
|
| 27 |
+
## Live Federated Search Upgrade
|
| 28 |
+
- Async querying via `asyncio` + `httpx`
|
| 29 |
+
- Adapter interface with rate limits
|
| 30 |
+
- Response caching + deduplication
|
| 31 |
+
- Circuit breakers for abuse prevention
|
| 32 |
+
|
| 33 |
+
## Compliance
|
| 34 |
+
- Respect robots.txt where applicable
|
| 35 |
+
- Rate limiting per agency
|
| 36 |
+
- Redaction before export
|
| 37 |
+
- Audit logging enabled
|
| 38 |
+
|
| 39 |
+
## Build Timestamp
|
| 40 |
+
2026-01-09T23:51:16.728748Z
|
SOURCES.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FOIA Public Sources
|
| 2 |
+
|
| 3 |
+
All sources listed here are **public FOIA electronic reading rooms** or official public-release libraries.
|
| 4 |
+
|
| 5 |
+
## Intelligence & Defense
|
| 6 |
+
- CIA FOIA Electronic Reading Room — https://www.cia.gov/readingroom/
|
| 7 |
+
- FBI Vault — https://vault.fbi.gov/
|
| 8 |
+
- DARPA FOIA Library — https://www.darpa.mil/work-with-us/foia
|
| 9 |
+
- NRO FOIA Reading Room — https://www.nro.gov/FOIA/
|
| 10 |
+
- DoD FOIA Reading Room — https://www.esd.whs.mil/FOIA/Reading-Room/
|
| 11 |
+
|
| 12 |
+
## Military Branches
|
| 13 |
+
- U.S. Army FOIA — https://www.army.mil/foia
|
| 14 |
+
- U.S. Navy FOIA — https://www.secnav.navy.mil/foia
|
| 15 |
+
- U.S. Air Force FOIA — https://www.af.mil/FOIA/
|
| 16 |
+
- U.S. Marine Corps FOIA — https://www.hqmc.marines.mil/Agencies/FOIA/
|
| 17 |
+
- U.S. Space Force FOIA — https://www.spaceforce.mil/FOIA/
|
| 18 |
+
- U.S. Coast Guard FOIA — https://www.uscg.mil/FOIA/
|
| 19 |
+
|
| 20 |
+
## Other Agencies
|
| 21 |
+
- DHS FOIA Library — https://www.dhs.gov/foia-library
|
| 22 |
+
- DEA FOIA Reading Room — https://www.dea.gov/foia
|
| 23 |
+
- Secret Service FOIA — https://www.secretservice.gov/foia
|
__init__.py
ADDED
|
File without changes
|
aatip_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class AATIPAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for AATIP-related public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "AATIP"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
aatip_sample.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
AATIP referenced ████ by DoD components between 2009 and 2017.
|
agency_registry.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ingest/agency_registry.py

# Whitelist of PUBLIC FOIA / reading-room hosts. A domain appears here only
# when the agency operates a public FOIA page. "label:" entries are display
# labels only — they require the user to supply a public FOIA URL themselves.
ALLOWED_FOIA_SOURCES = {
    # Core
    "vault.fbi.gov": "FBI",
    "www.cia.gov": "CIA",
    "www.archives.gov": "NARA",
    "foia.state.gov": "State Dept",
    "www.nsa.gov": "NSA",
    "www.defense.gov": "DoD",
    "www.esd.whs.mil": "DoD FOIA",
    "www.whitehouse.gov": "White House",

    # Military (public FOIA pages)
    "www.af.mil": "USAF",
    "www.navy.mil": "US Navy",
    "www.marines.mil": "USMC",
    "www.army.mil": "US Army",
    "www.spaceforce.mil": "US Space Force",

    # Intelligence / defense components (public FOIA pages only)
    "www.dia.mil": "DIA",
    "www.nro.gov": "NRO",

    # Law enforcement / protective services (public FOIA pages)
    "www.secretservice.gov": "US Secret Service",
    "www.dea.gov": "DEA",

    # Historical / organizational labels — NO claim of dedicated public
    # repositories; applied only when a public FOIA URL is supplied.
    "label:SAC": "CIA Special Activities Center (label only)",
    "label:SAD": "CIA Special Activities Division (label only)",
    "label:NIA": "National Intelligence Authority (historical)",
}
|
air_force_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class USAirForceAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for U.S. Air Force public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "USAirForce"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
analysis.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List


def build_timeline(docs: List[dict]) -> Dict[str, int]:
    """Count documents per four-digit year.

    Args:
        docs: Document metadata dicts; each may carry a "date" value
            whose text starts with the year (e.g. "2001-05-12").

    Returns:
        Mapping of year string -> number of documents from that year.
        Entries with a missing, None, or non-year date are skipped.
    """
    timeline: Dict[str, int] = {}
    for doc in docs:
        # `or ""` guards against an explicit None date (the original
        # `d.get("date", "")[:4]` raised TypeError there); str() tolerates
        # non-string dates such as an integer year.
        year = str(doc.get("date") or "")[:4]
        if not year.isdigit():
            continue
        timeline[year] = timeline.get(year, 0) + 1
    return timeline
|
analytics.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import time
from collections import Counter

# Process-local event tally; lives for the lifetime of the interpreter.
_events = Counter()


def track(event: str):
    """Record one occurrence of *event*."""
    _events[event] += 1


def snapshot():
    """Return the current event tally alongside a UNIX timestamp."""
    return {
        "timestamp": int(time.time()),
        "events": dict(_events),
    }
|
app.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import asyncio
from ingest.cia_reading_room import CIAAdapter
from ingest.fbi_vault_live import FBIAdapter
from ingest.dod_reading_room_live import DoDAdapter
from core.async_search import fanout_search
from core.cache import dedupe
from core.cluster import cluster_results
from core.citations import citation_block
from core.redaction import redaction_confidence
from core.journalist import journalist_export
from core.explain import explain

# One adapter instance per agency, shared across all requests.
cia, fbi, dod = CIAAdapter(), FBIAdapter(), DoDAdapter()


async def run(q):
    """Fan the query out to every agency adapter and deduplicate merged hits."""
    hits = await fanout_search([cia, fbi, dod], q)
    return dedupe(hits)


with gr.Blocks() as demo:
    gr.Markdown("# FOIA Federated Search — Supreme")

    q = gr.Textbox(label="Query")
    results_state = gr.State([])

    with gr.Tabs():
        with gr.Tab("Clustered Results"):
            clusters = gr.JSON()
        with gr.Tab("Citations"):
            cites = gr.Markdown()
        with gr.Tab("Explainability"):
            explain_box = gr.JSON()

    preview = gr.JSON(label="Redaction Confidence")

    def _run(q):
        # Bridge the async fan-out into Gradio's synchronous callback.
        merged = asyncio.run(run(q))
        grouped = cluster_results(merged)
        citations_md = "\n".join(citation_block(hit) for hit in merged[:5])
        explanation = explain(merged)
        # Per-URL redaction confidence for the preview panel.
        confidence = {hit.get("url"): redaction_confidence(hit) for hit in merged}
        return merged, grouped, citations_md, explanation, confidence

    btn = gr.Button("Search")
    btn.click(_run, inputs=q, outputs=[results_state, clusters, cites, explain_box, preview])

    exp = gr.Button("Journalist Export")
    out = gr.File()
    exp.click(lambda r: journalist_export(r, "/tmp/journalist_export.zip"), inputs=results_state, outputs=out)

demo.launch()
|
appeal_pdf.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet


def generate_appeal_pdf(text, filename="appeal.pdf"):
    """Render *text* as a single-paragraph PDF and return the output path."""
    stylesheet = getSampleStyleSheet()
    body = Paragraph(text, stylesheet["BodyText"])
    SimpleDocTemplate(filename).build([body])
    return filename
|
appeals.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Compose a plain-text FOIA appeal letter for a publicly released document.

    Args:
        document: Identifier or title of the document being appealed.
        agency: Agency that issued the redaction or withholding.
        reason: Requester's basis for the appeal.

    Returns:
        The complete letter with surrounding whitespace trimmed.
    """
    letter = f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
"""
    # Trim the newlines introduced by the triple-quoted literal.
    return letter.strip()
|
army_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter


class USArmyAdapter(GenericFOIAAdapter):
    """FOIA reading-room adapter for U.S. Army public releases.

    NOTE: restricted to publicly released materials only — inherits all
    fetching behavior from GenericFOIAAdapter.
    """

    name = "USArmy"
    rate_limit = 1          # max requests per second against the public endpoint
    robots_respected = True  # adapter honors the host's robots.txt
|
async_search.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio


async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge their document lists.

    Adapters that raise are silently skipped (gather captures their
    exceptions); only list-shaped results contribute to the output.
    """
    gathered = await asyncio.gather(
        *(adapter.search(query) for adapter in adapters),
        return_exceptions=True,
    )
    merged = []
    for outcome in gathered:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
|
audit.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
from datetime import datetime
from typing import Dict, List

# In-memory, append-only audit trail (process lifetime only).
_AUDIT_LOG: List[Dict] = []


def log_event(action: str, payload: Dict) -> Dict:
    """Append one audit entry and return it.

    Each entry gets a random UUID and a UTC ISO-8601 timestamp
    (suffixed with "Z" to mark it as UTC).
    """
    record = {
        "id": str(uuid.uuid4()),
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "action": action,
        "payload": payload,
    }
    _AUDIT_LOG.append(record)
    return record


def export_audit_log() -> List[Dict]:
    """Return a shallow copy of the trail so callers cannot mutate the log."""
    return list(_AUDIT_LOG)
|
cache.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
from typing import Dict, Any, List
from core.faiss_vector import FaissIndex

_TTL = 300  # seconds a cached entry stays valid
_cache: Dict[str, Any] = {}
_faiss = None  # lazily built semantic index shared by all cache writes


def _now():
    """Current UNIX time in whole seconds."""
    return int(time.time())


def _get_index():
    """Build the FAISS index on first use and reuse it afterwards."""
    global _faiss
    if _faiss is None:
        _faiss = FaissIndex()
    return _faiss


def cache_get(key):
    """Return cached data for *key*, or None when absent or expired."""
    entry = _cache.get(key)
    if not entry:
        return None
    stored_at, payload = entry
    if _now() - stored_at > _TTL:
        # Expired — evict so the cache dict does not grow without bound.
        _cache.pop(key, None)
        return None
    return payload


def cache_set(key, data: List[dict]):
    """Store *data* under *key* and feed its snippets to the semantic index."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    snippets = [d.get("snippet", "") for d in data if d.get("snippet")]
    if snippets:
        try:
            _get_index().add(snippets)
        except Exception:
            # Semantic indexing is best-effort; caching must never fail
            # because the vector backend is unavailable.
            pass


def dedupe(results: List[dict]) -> List[dict]:
    """Drop duplicate hits keyed on (source, url, snippet), preserving order."""
    seen = set()
    unique = []
    for hit in results:
        fingerprint = hash((hit.get("source"), hit.get("url"), hit.get("snippet")))
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append(hit)
    return unique


def source_counts(results: List[dict]) -> Dict[str, int]:
    """Tally results per agency source label (missing label -> "Unknown")."""
    counts = {}
    for hit in results:
        label = hit.get("source", "Unknown")
        counts[label] = counts.get(label, 0) + 1
    return counts
|
cia.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .common import fetch, clean
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def search_cia(query):
    """Search the CIA FOIA Electronic Reading Room for *query*.

    Returns a list of result dicts (title/agency/date/snippet/url).
    Fixes vs. the original: anchors without an ``href`` no longer raise
    KeyError, and absolute links are passed through unchanged instead of
    being prefixed again (same handling as ``search_fbi``).
    """
    url = "https://www.cia.gov/readingroom/search/site/"
    html = fetch(url, {"search_api_fulltext": query})
    soup = BeautifulSoup(html, "html.parser")

    results = []
    for item in soup.select(".views-row"):
        a = item.select_one("a")
        if not a:
            continue
        href = a.get("href", "")
        results.append({
            "title": clean(a.text),
            "agency": "CIA",
            "date": None,
            "snippet": None,
            # Site-relative links need the host prefix; absolute ones do not.
            "url": href if href.startswith("http") else "https://www.cia.gov" + href,
        })
    return results
|
cia_reading_room.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 3 |
+
|
| 4 |
+
class CIAAdapter(GenericFOIAAdapter):
    """Adapter for the CIA FOIA Reading Room public search page."""

    name = "CIA"
    rate_limit = 1  # requests per second
    robots_respected = True
    base_url = "https://www.cia.gov/readingroom/search/site/"

    async def search(self, query: str):
        """Query the public search endpoint; returns a single page-level hit
        (minimal safe parse), or [] on any non-200 response."""
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.get(self.base_url, params={"query": query})
        if response.status_code != 200:
            return []
        return [{
            "source": "CIA FOIA Reading Room",
            "query": query,
            "url": str(response.url),
            "snippet": "Public FOIA search result page",
        }]
|
citations.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def citation_block(result: dict) -> str:
    """Render a plain-text citation footer for a single search result."""
    lines = [
        "---",
        f"Source: {result.get('source')}",
        f"Title: {result.get('title','Unknown')}",
        f"URL: {result.get('url')}",
        f"Retrieved: {result.get('retrieved_at','N/A')}",
        "---",
    ]
    return "\n".join(lines)
|
cluster.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
from core.faiss_vector import FaissIndex
|
| 3 |
+
|
| 4 |
+
def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group results by their "source" field.

    The original also embedded every snippet into a fresh FaissIndex whose
    output was never read — pure dead work that additionally raised
    RuntimeError whenever the optional FAISS stack was absent. That step
    is removed; the returned grouping is unchanged.

    ``k`` is retained for interface compatibility but is currently unused
    (no semantic clustering is performed yet).
    """
    clusters: Dict[str, List[dict]] = {}
    for r in results:
        clusters.setdefault(r.get("source", "Unknown"), []).append(r)
    return clusters
|
coast_guard_foia_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class USCoastGuardAdapter(GenericFOIAAdapter):
    """US Coast Guard public-release FOIA reading-room adapter.

    NOTE: restricted to publicly released materials only.
    """

    name = "USCoastGuard"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
collaboration.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import Dataset
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
_COLLAB: List[Dict] = []  # in-memory, process-local store of collaboration notes
|
| 5 |
+
|
| 6 |
+
def add_collaboration_note(document: str, note: str) -> Dict:
    """Attach a free-text note to *document* and return the stored record."""
    record: Dict = {"document": document, "note": note}
    _COLLAB.append(record)
    return record
|
| 13 |
+
|
| 14 |
+
def get_collaboration_dataset() -> Dataset:
    """Materialize all collaboration notes as a Hugging Face Dataset.

    An empty store yields a Dataset with the expected (empty) columns.
    """
    if _COLLAB:
        return Dataset.from_list(_COLLAB)
    return Dataset.from_dict({"document": [], "note": []})
|
common.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
# Identifying User-Agent sent with every outbound request (public, non-crawling use).
HEADERS = {
    "User-Agent": "FOIA-Federated-Search/1.0 (public, non-crawling)"
}
|
| 7 |
+
|
| 8 |
+
def fetch(url, params=None):
    """GET *url* (optionally with query *params*) and return the body text.

    Raises requests.HTTPError on non-2xx responses; 10-second timeout.
    """
    response = requests.get(url, params=params, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response.text
|
| 12 |
+
|
| 13 |
+
def clean(text):
    """Collapse runs of whitespace to single spaces; '' for falsy input."""
    if not text:
        return ""
    return " ".join(text.split())
|
darpa_reading_room.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DARPAAdapter(GenericFOIAAdapter):
    """DARPA public-release FOIA reading-room adapter.

    NOTE: restricted to publicly released materials only.
    """

    name = "DARPA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dea.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dea(query):
    """Return a single pointer result to the DEA FOIA reading room.

    The snippet echoes the query; no live search is performed.
    """
    hit = {
        "title": "DEA FOIA Reading Room",
        "agency": "DEA",
        "date": None,
        "snippet": query,
        "url": "https://www.dea.gov/foia",
    }
    return [hit]
|
dea_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DEAAdapter(GenericFOIAAdapter):
    """DEA public-release FOIA reading-room adapter."""

    name = "DEA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dhs.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dhs(query):
    """Return a single pointer result to the DHS FOIA library.

    The snippet echoes the query; no live search is performed.
    (Fix: the title was an f-string with no placeholders — now a plain
    literal with identical value.)
    """
    return [{
        "title": "DHS FOIA Search",
        "agency": "DHS",
        "date": None,
        "snippet": query,
        "url": "https://www.dhs.gov/foia",
    }]
|
dhs_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DHSAdapter(GenericFOIAAdapter):
    """DHS public-release FOIA reading-room adapter."""

    name = "DHS"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dia.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dia(query):
    """Return a single pointer result to the DIA FOIA reading room.

    The snippet echoes the query; no live search is performed.
    """
    hit = {
        "title": "DIA FOIA Reading Room",
        "agency": "DIA",
        "date": None,
        "snippet": query,
        "url": "https://www.dia.mil/FOIA/",
    }
    return [hit]
|
dia_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DIAAdapter(GenericFOIAAdapter):
    """DIA public-release FOIA reading-room adapter."""

    name = "DIA"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dod.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_dod(query):
    """Return a single redirect-style result for the DoD FOIA reading room."""
    title = "DoD FOIA Search: " + query
    return [{
        "title": title,
        "agency": "DoD",
        "date": None,
        "snippet": "Redirect to DoD FOIA Reading Room search",
        "url": "https://open.defense.gov/Transparency/FOIA/",
    }]
|
dod_reading_room.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 2 |
+
|
| 3 |
+
class DoDAdapter(GenericFOIAAdapter):
    """DoD public-release FOIA reading-room adapter."""

    name = "DoD"
    robots_respected = True   # honor robots.txt
    rate_limit = 1            # requests per second
|
dod_reading_room_live.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
from ingest.generic_public_foia import GenericFOIAAdapter
|
| 3 |
+
|
| 4 |
+
class DoDAdapter(GenericFOIAAdapter):
    """Live adapter for the public DoD FOIA reading room."""

    name = "DoD FOIA Reading Room"
    rate_limit = 1  # requests per second
    robots_respected = True
    base_url = "https://www.esd.whs.mil/FOIA/Reading-Room/"

    async def search(self, query: str):
        """Hit the reading-room page with *query*; returns one page-level
        result, or [] on any non-200 response."""
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.get(self.base_url, params={"search": query})
        if response.status_code != 200:
            return []
        return [{
            "source": "DoD FOIA Reading Room",
            "query": query,
            "url": str(response.url),
            "snippet": "Public DoD FOIA reading room page",
        }]
|
doj.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def search_doj(query):
    """Return a single pointer result to the DOJ FOIA reading room.

    The snippet echoes the query; no live search is performed.
    (Fix: the title was an f-string with no placeholders — now a plain
    literal with identical value.)
    """
    return [{
        "title": "DOJ FOIA Reading Room",
        "agency": "DOJ",
        "date": None,
        "snippet": query,
        "url": "https://www.justice.gov/oip/foia-reading-room",
    }]
|
entity_graph.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import networkx as nx
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
|
| 4 |
+
def build_entity_graph(docs: List[Dict]) -> Dict:
    """Build an agency-to-entity co-occurrence graph as node/link dicts.

    Tokens in a document's "content" that are all-uppercase and longer than
    two characters are treated as candidate entities and linked to the
    document's agency. Returns {"nodes": [...], "links": [...]} suitable for
    force-directed visualisation.
    """
    graph = nx.Graph()

    for doc in docs:
        agency = doc.get("agency", "Unknown")
        graph.add_node(agency, group="agency")

        for token in doc.get("content", "").split():
            # Crude acronym heuristic: ALL-CAPS tokens longer than 2 chars.
            if len(token) > 2 and token.isupper():
                graph.add_node(token, group="entity")
                graph.add_edge(agency, token)

    return {
        "nodes": [{"id": node, "group": graph.nodes[node]["group"]} for node in graph.nodes],
        "links": [{"source": a, "target": b} for a, b in graph.edges],
    }
|
explain.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def explain(results):
    """Summarize how a result set was produced (transparency metadata)."""
    distinct_sources = {r.get("source") for r in results}
    return {
        "total_results": len(results),
        "sources": list(distinct_sources),
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)",
        ],
        "no_restricted_access": True,
    }
|
export_utils.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def export_json(data, path="/tmp/results.json"):
    """Write *data* as pretty-printed JSON and return the file path.

    *path* was previously hard-coded; it is now a backward-compatible
    parameter whose default preserves the old behavior.
    """
    with open(path, "w") as f:
        json.dump(data, f, indent=2)
    return path
|
faiss_vector.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional semantic-search stack. Both names are defined on failure so that
# FaissIndex.__init__ raises its intended RuntimeError instead of a NameError
# when the packages are absent (previously SentenceTransformer was left
# undefined if either import failed).
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    faiss = None
    SentenceTransformer = None
|
| 6 |
+
|
| 7 |
+
class FaissIndex:
    """Thin wrapper around a flat-L2 FAISS index over sentence embeddings.

    Raises RuntimeError at construction if the optional FAISS stack is
    not installed.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add() (needs embedding dim)
        self.docs = []     # stored texts, parallel to index rows

    def add(self, texts):
        """Embed *texts* and append them to the index; no-op for empty input."""
        if not texts:
            return  # fix: encoding an empty batch cannot size/feed the index
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query*.

        Returns [] before anything has been added (fix: previously crashed
        on ``None.search``). FAISS pads with index -1 when k exceeds the
        number of stored vectors, so negative ids are filtered out (fix:
        ``i < len(docs)`` alone wrongly admitted -1, i.e. docs[-1]).
        """
        if self.index is None:
            return []
        emb = self.model.encode([query])
        D, I = self.index.search(emb, k)
        return [self.docs[i] for i in I[0] if 0 <= i < len(self.docs)]
|
fbi.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .common import fetch, clean
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def search_fbi(query):
    """Search the FBI Vault and return result dicts for /vault/ links."""
    html = fetch("https://vault.fbi.gov/search", {"SearchableText": query})
    soup = BeautifulSoup(html, "html.parser")

    hits = []
    for anchor in soup.select("a"):
        href = anchor.get("href", "")
        if "/vault/" not in href:
            continue
        # Relative links need the host prefix; absolute ones pass through.
        full_url = href if href.startswith("http") else "https://vault.fbi.gov" + href
        hits.append({
            "title": clean(anchor.text),
            "agency": "FBI",
            "date": None,
            "snippet": None,
            "url": full_url,
        })
    return hits
|
fbi_vault.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
def ingest_fbi_vault(url: str) -> dict:
    """Fetch a public FBI Vault page and return a normalized document record.

    Fix: a botched merge left the returned dict being *called* with
    ``(r.text, "html.parser")`` — a guaranteed TypeError at runtime —
    followed by an unreachable duplicate of the whole body. This is the
    single clean path; raises requests.HTTPError on non-2xx responses.
    """
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    text = soup.get_text(separator=" ", strip=True)
    title = soup.find("h1")

    return {
        "source": "FBI Vault",
        "agency": "FBI",
        "url": url,
        "title": title.text if title else "FBI Vault Document",
        "text": text[:10000],  # cap stored text to 10k chars
    }
|