Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Community

GodsDevProject committed on Jan 11

Commit

59314d7

verified ·

1 Parent(s): 9b034b1

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -162

app.py CHANGED Viewed

@@ -4,30 +4,15 @@
 # ======================================================
 import gradio as gr
-import time, hashlib, io, zipfile, os, tempfile, base64
 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
-# Optional PDF tooling (safe fallbacks)
-PDF_TEXT_AVAILABLE = False
-PDF_THUMBNAIL_AVAILABLE = False
-try:
-    from pdfminer.high_level import extract_text
-    PDF_TEXT_AVAILABLE = True
-except Exception:
-    pass
-try:
-    from pdf2image import convert_from_bytes
-    PDF_THUMBNAIL_AVAILABLE = True
-except Exception:
-    pass
-from reportlab.platypus import (
-    SimpleDocTemplate, Paragraph, PageBreak
-)
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.lib.pagesizes import LETTER
@@ -35,16 +20,17 @@ from reportlab.lib.pagesizes import LETTER
 # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
 # ======================================================
-ENABLE_FAISS_PHASE_4 = False        # HARD DISABLED
-ENABLE_AI = True                   # USER OPT-IN ONLY
-ENABLE_PDF_EXTRACTION = True        # USER OPT-IN ONLY
-FIPS_140_MODE = False
 # ======================================================
 # SESSION STATE (EPHEMERAL)
 # ======================================================
 LAST_RESULTS = []
 SELECTED_INDEX = None
 # ======================================================
@@ -52,7 +38,7 @@ SELECTED_INDEX = None
 # ======================================================
 def sha256_text(t: str):
-    return hashlib.sha256(t.encode()).hexdigest()
 def citation_hash(r):
     return hashlib.sha256(
@@ -66,39 +52,43 @@ def provenance_headers(payload: str):
         "Content-SHA256": sha256_text(payload),
         "Public-Source-Only": "true",
         "AI-Assisted": "formatting-only",
-        "FIPS-140-Mode": str(FIPS_140_MODE).lower(),
     }
 def render_provenance_block(text: str):
-    return "\n".join(
-        f"{k}: {v}" for k, v in provenance_headers(text).items()
-    )
 # ======================================================
-# FAISS PHASE-4 (STUB — GOVERNANCE LOCKED)
 # ======================================================
-class Phase4FAISSStub:
     def __init__(self):
-        if ENABLE_FAISS_PHASE_4:
             raise RuntimeError(
-                "FAISS Phase-4 indexing is disabled by governance policy."
             )
 # ======================================================
-# FOIA ADAPTERS (LINK-OUT ONLY, API-READY)
 # ======================================================
 class FOIAAdapter:
     agency = "UNKNOWN"
     search_url = ""
-    api_endpoint = None   # future document-level APIs
     def search(self, query):
         start = time.time()
         url = self.search_url.format(q=quote_plus(query))
         latency = round((time.time() - start) * 1000, 1)
         return [{
             "agency": self.agency,
             "title": f"{self.agency} FOIA Reading Room",
@@ -106,16 +96,26 @@ class FOIAAdapter:
             "timestamp": datetime.utcnow().isoformat(),
             "latency_ms": latency,
             "sealed": False,
-            "redacted": False,
         }]
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
 class FBI(FOIAAdapter):
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
 class DOJ(FOIAAdapter):
     agency = "DOJ"
@@ -123,7 +123,7 @@ class DOJ(FOIAAdapter):
 class DHS(FOIAAdapter):
     agency = "DHS"
-    search_url = "https://www.dhs.gov/foia-library"
 class STATE(FOIAAdapter):
     agency = "State Department"
@@ -131,7 +131,7 @@ class STATE(FOIAAdapter):
 class NSA(FOIAAdapter):
     agency = "NSA"
-    search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/"
 ALL_ADAPTERS = {
     "CIA": CIA(),
@@ -143,7 +143,7 @@ ALL_ADAPTERS = {
 }
 # ======================================================
-# PDF RESOLUTION (SAFE HEAD CHECK)
 # ======================================================
 def resolve_pdf_url(url):
@@ -155,25 +155,6 @@ def resolve_pdf_url(url):
     except Exception:
         return False, url
-def generate_pdf_thumbnails(url, max_pages=2):
-    if not PDF_THUMBNAIL_AVAILABLE:
-        return []
-    try:
-        r = requests.get(url, timeout=10)
-        images = convert_from_bytes(
-            r.content,
-            first_page=1,
-            last_page=max_pages
-        )
-        thumbs = []
-        for img in images:
-            buf = io.BytesIO()
-            img.save(buf, format="PNG")
-            thumbs.append(base64.b64encode(buf.getvalue()).decode())
-        return thumbs
-    except Exception:
-        return []
 # ======================================================
 # SEARCH
 # ======================================================
@@ -182,21 +163,14 @@ def run_search(query, agencies):
     global LAST_RESULTS, SELECTED_INDEX
     LAST_RESULTS = []
     SELECTED_INDEX = None
     rows = []
     for name in agencies:
         adapter = ALL_ADAPTERS[name]
         for r in adapter.search(query):
-            is_pdf, resolved = resolve_pdf_url(r["url"])
-            r["resolved_pdf"] = is_pdf
-            r["resolved_url"] = resolved
             r["hash"] = citation_hash(r)
-            r["thumbnails"] = (
-                generate_pdf_thumbnails(resolved) if is_pdf else []
-            )
             LAST_RESULTS.append(r)
             rows.append([
                 r["agency"],
                 r["title"],
@@ -208,143 +182,183 @@ def run_search(query, agencies):
     return rows, render_cards(), "No document selected"
 # ======================================================
-# RESULT CARDS
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
-        thumbs = "".join(
-            f'<img src="data:image/png;base64,{t}" '
-            f'style="width:30%;margin:4px;border-radius:6px;" />'
-            for t in r["thumbnails"]
-        )
-        preview = thumbs or f'<a href="{r["resolved_url"]}" target="_blank">View Source</a>'
         cards.append(f"""
         <div class="card">
           <div class="card-header">
-            <b>{r['agency']}</b>
-            <span class="badge">PUBLIC</span>
           </div>
           <div><b>{r['title']}</b></div>
-          <div>{preview}</div>
           <div class="actions">
-            <a href="{r['resolved_url']}" target="_blank">View</a>
-            {"<a href='"+r['resolved_url']+"' download>Download</a>" if r["resolved_pdf"] else ""}
-            <a href="{r['resolved_url']}" target="_blank">Share</a>
           </div>
         </div>
         """)
-    return "".join(cards) or "No results."
 # ======================================================
-# CM/ECF BUNDLE
 # ======================================================
-def generate_ecf_filing_number():
-    return f"ECF-PREFILE-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"
-def generate_cover_sheet_pdf(district, ecf_no):
-    buf = io.BytesIO()
-    styles = getSampleStyleSheet()
-    body = (
-        f"<b>CM/ECF PRE-FILING COVER SHEET</b><br/><br/>"
-        f"<b>District:</b> {district}<br/>"
-        f"<b>Reference No.:</b> {ecf_no}<br/><br/>"
-        "This bundle contains public FOIA references only.<br/>"
-        "No filing, certification, or authentication is made."
-    )
-    doc = SimpleDocTemplate(buf, pagesize=LETTER)
-    doc.build([
-        Paragraph(body, styles["Normal"]),
-        PageBreak(),
-        Paragraph(
-            render_provenance_block(body).replace("\n", "<br/>"),
-            styles["Code"]
-        )
-    ])
-    buf.seek(0)
-    return buf
-def generate_court_bundle(district):
-    ecf_no = generate_ecf_filing_number()
     with tempfile.TemporaryDirectory() as td:
         zpath = os.path.join(td, "court_bundle.zip")
         with zipfile.ZipFile(zpath, "w") as z:
-            z.writestr(
-                "00_Cover_Sheet.pdf",
-                generate_cover_sheet_pdf(district, ecf_no).read()
-            )
             for i, r in enumerate(LAST_RESULTS, 1):
                 z.writestr(
-                    f"Exhibit_{i:03d}.txt",
-                    f"{r['agency']}\n{r['resolved_url']}"
                 )
                 z.writestr(
-                    f"Exhibit_{i:03d}.sha256",
-                    r["hash"]
                 )
-        return open(zpath, "rb")
 # ======================================================
 # UI
 # ======================================================
 CSS = """
-.card { border:1px solid #ddd; border-radius:16px; padding:16px; margin-bottom:20px; }
 .card-header { display:flex; justify-content:space-between; }
-.actions { margin-top:10px; display:flex; gap:12px; }
-.badge { background:#eef; padding:4px 10px; border-radius:999px; }
 """
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
-    gr.Markdown(
-        "## Federal FOIA Intelligence Search\n"
-        "Public FOIA reading rooms only • Research & education use"
-    )
     with gr.Tab("Search"):
-        agencies = gr.CheckboxGroup(
-            list(ALL_ADAPTERS.keys()),
-            value=list(ALL_ADAPTERS.keys()),
-            label="Agencies"
-        )
-        query = gr.Textbox(placeholder="Search terms")
-        table = gr.Dataframe(
-            headers=["Agency", "Title", "Resolved URL", "Hash", "Latency"]
-        )
         gallery = gr.HTML()
-        status = gr.Textbox(label="Status")
-        gr.Button("Search").click(
-            run_search,
-            [query, agencies],
-            [table, gallery, status]
         )
-    with gr.Tab("Court / Clerk"):
-        district = gr.Dropdown(
-            ["Generic", "D.D.C.", "S.D.N.Y.", "N.D. Cal."],
-            value="Generic"
-        )
-        gr.File(label="Download CM/ECF Bundle").upload(
-            lambda d=district: generate_court_bundle(d)
         )
-    with gr.Tab("Governance"):
-        gr.Markdown(
-            "• No scraping\n"
-            "• No certification\n"
-            "• AI formatting only\n"
-            "• Court-safe by design"
-        )
-app.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    share=True
-)

 # ======================================================
 import gradio as gr
+import time, hashlib, io, zipfile, os, tempfile, base64, json
 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
+from fastapi import FastAPI, Response
+from fastapi.staticfiles import StaticFiles
+from reportlab.platypus import SimpleDocTemplate, Paragraph, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.lib.pagesizes import LETTER
 # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
 # ======================================================
+ENABLE_FAISS_PHASE_4 = False          # DEFAULT OFF – requires formal approval
+ENABLE_AI = True                     # USER OPT-IN ONLY
+ENABLE_PDF_EXTRACTION = True         # USER OPT-IN ONLY
+ENABLE_DOC_LEVEL_APIS = False        # API-ONLY, OFF BY DEFAULT
 # ======================================================
 # SESSION STATE (EPHEMERAL)
 # ======================================================
 LAST_RESULTS = []
+AI_APPENDICES = []
 SELECTED_INDEX = None
 # ======================================================
 # ======================================================
 def sha256_text(t: str):
+    return hashlib.sha256(t.encode("utf-8")).hexdigest()
 def citation_hash(r):
     return hashlib.sha256(
         "Content-SHA256": sha256_text(payload),
         "Public-Source-Only": "true",
         "AI-Assisted": "formatting-only",
+        "Court-Safe": "true",
     }
 def render_provenance_block(text: str):
+    return "\n".join(f"{k}: {v}" for k, v in provenance_headers(text).items())
 # ======================================================
+# PHASE-4 FAISS (HARD GATED)
 # ======================================================
+class Phase4FAISS:
     def __init__(self):
+        if not ENABLE_FAISS_PHASE_4:
             raise RuntimeError(
+                "Phase-4 FAISS indexing is disabled pending formal approval."
             )
+        self.index = {}
+    def add_document(self, doc_id, text):
+        self.index[doc_id] = text
+    def search(self, query):
+        return []  # intentionally non-operational until approved
 # ======================================================
+# FOIA ADAPTERS (LIVE LINK-OUT + API-READY)
 # ======================================================
 class FOIAAdapter:
     agency = "UNKNOWN"
     search_url = ""
+    api_endpoint = None  # API-ONLY when available
     def search(self, query):
         start = time.time()
         url = self.search_url.format(q=quote_plus(query))
         latency = round((time.time() - start) * 1000, 1)
         return [{
             "agency": self.agency,
             "title": f"{self.agency} FOIA Reading Room",
             "timestamp": datetime.utcnow().isoformat(),
             "latency_ms": latency,
             "sealed": False,
         }]
+    def api_ingest(self, query):
+        if not ENABLE_DOC_LEVEL_APIS or not self.api_endpoint:
+            return []
+        try:
+            r = requests.get(self.api_endpoint, params={"q": query}, timeout=10)
+            return r.json().get("documents", [])
+        except Exception:
+            return []
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
+    api_endpoint = None  # published when CIA releases API
 class FBI(FOIAAdapter):
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
+    api_endpoint = None  # placeholder for FBI API
 class DOJ(FOIAAdapter):
     agency = "DOJ"
 class DHS(FOIAAdapter):
     agency = "DHS"
+    search_url = "https://www.dhs.gov/foia-library/search?search={q}"
 class STATE(FOIAAdapter):
     agency = "State Department"
 class NSA(FOIAAdapter):
     agency = "NSA"
+    search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/?q={q}"
 ALL_ADAPTERS = {
     "CIA": CIA(),
 }
 # ======================================================
+# PDF RESOLUTION (SAFE)
 # ======================================================
 def resolve_pdf_url(url):
     except Exception:
         return False, url
 # ======================================================
 # SEARCH
 # ======================================================
     global LAST_RESULTS, SELECTED_INDEX
     LAST_RESULTS = []
     SELECTED_INDEX = None
     rows = []
     for name in agencies:
         adapter = ALL_ADAPTERS[name]
         for r in adapter.search(query):
+            r["resolved_pdf"], r["resolved_url"] = resolve_pdf_url(r["url"])
             r["hash"] = citation_hash(r)
             LAST_RESULTS.append(r)
             rows.append([
                 r["agency"],
                 r["title"],
     return rows, render_cards(), "No document selected"
 # ======================================================
+# AI GOVERNANCE + HASHED OUTPUT
+# ======================================================
+def can_enable_ai(r):
+    return (
+        ENABLE_AI
+        and r.get("resolved_pdf", False)
+        and not r.get("sealed", False)
+    )
+def ask_ai_for_document(index):
+    global SELECTED_INDEX, AI_APPENDICES
+    SELECTED_INDEX = index
+    r = LAST_RESULTS[index]
+    ai_text = (
+        "AI-ASSISTED REFERENCE SUMMARY\n\n"
+        f"Agency: {r['agency']}\n"
+        f"Source URL: {r['resolved_url']}\n\n"
+        "This content is assistive, non-authoritative, "
+        "and not offered as evidence or legal analysis."
+    )
+    ai_hash = sha256_text(ai_text)
+    provenance = render_provenance_block(ai_text)
+    appendix = {
+        "index": index,
+        "text": ai_text,
+        "hash": ai_hash,
+        "provenance": provenance,
+    }
+    AI_APPENDICES.append(appendix)
+    return (
+        ai_text
+        + "\n\n---\nAI HASH:\n"
+        + ai_hash
+        + "\n\nPROVENANCE:\n"
+        + provenance
+    )
+# ======================================================
+# RENDER RESULT CARDS
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
+        enabled = can_enable_ai(r)
         cards.append(f"""
         <div class="card">
           <div class="card-header">
+            <strong>{r['agency']}</strong>
+            <button class="ask-ai"
+              onclick="window.askAI({idx})"
+              {"disabled" if not enabled else ""}>
+              Ask AI
+            </button>
           </div>
           <div><b>{r['title']}</b></div>
           <div class="actions">
+            <a href="{r['resolved_url']}" target="_blank">View Source</a>
           </div>
         </div>
         """)
+    return "".join(cards) or "No results found."
 # ======================================================
+# COURT / CM-ECF BUNDLE (AI SEPARATED)
 # ======================================================
+def generate_court_bundle():
+    ecf_no = f"ECF-PREFILE-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"
     with tempfile.TemporaryDirectory() as td:
         zpath = os.path.join(td, "court_bundle.zip")
         with zipfile.ZipFile(zpath, "w") as z:
             for i, r in enumerate(LAST_RESULTS, 1):
+                content = (
+                    f"{r['agency']} FOIA Reading Room\n"
+                    f"{r['resolved_url']}\n\n"
+                    f"{render_provenance_block(r['resolved_url'])}"
+                )
+                z.writestr(f"Exhibit_{i:03d}.txt", content)
+                z.writestr(f"Exhibit_{i:03d}.sha256", r["hash"])
+            for j, a in enumerate(AI_APPENDICES, 1):
                 z.writestr(
+                    f"AI_Appendix_{j:03d}.txt",
+                    a["text"] + "\n\n" + a["provenance"],
                 )
                 z.writestr(
+                    f"AI_Appendix_{j:03d}.sha256",
+                    a["hash"],
                 )
+            z.writestr(
+                "HF_Reviewer_Cover_Letter.txt",
+                "This application indexes public FOIA materials only.\n"
+                "AI output is segregated, hashed, disclosed, and non-evidentiary."
+            )
+            z.writestr(
+                "Judicial_Clerk_Training_Notes.txt",
+                "• FOIA sources only\n"
+                "• Verify URL + hash\n"
+                "• AI appendices are informational only\n"
+            )
+            z.writestr(
+                "Trust_and_Safety_Justification.txt",
+                "HF Trust & Safety Review:\n"
+                "No private data, no training on user content, no deception."
+            )
+        return zpath
+# ======================================================
+# FASTAPI MOUNT (GOVERNANCE SITE)
+# ======================================================
+fastapi_app = FastAPI()
+if os.path.exists("governance-site"):
+    fastapi_app.mount(
+        "/gov",
+        StaticFiles(directory="governance-site", html=True),
+        name="governance",
+    )
+@fastapi_app.get("/ask_ai")
+def ask_ai_endpoint(index: int):
+    return Response(ask_ai_for_document(index), media_type="text/plain")
 # ======================================================
 # UI
 # ======================================================
 CSS = """
+.card { border:1px solid #2a2a2a; border-radius:18px; padding:18px;
+        margin-bottom:22px; background:#0f0f0f; }
 .card-header { display:flex; justify-content:space-between; }
+.ask-ai { background:#1e88e5; color:white; border:none;
+          padding:6px 16px; border-radius:999px; }
+.ask-ai:disabled { background:#555; }
 """
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
+    gr.Markdown("## Federal FOIA Intelligence Search\nPublic FOIA sources only")
     with gr.Tab("Search"):
+        agencies = gr.CheckboxGroup(list(ALL_ADAPTERS.keys()),
+                                    value=list(ALL_ADAPTERS.keys()))
+        query = gr.Textbox()
+        table = gr.Dataframe(headers=["Agency","Title","URL","Hash","Latency"])
         gallery = gr.HTML()
+        status = gr.Textbox(lines=10)
+        gr.Button("Search").click(run_search, [query, agencies],
+                                  [table, gallery, status])
+    with gr.Tab("Court / CM-ECF"):
+        gr.File(label="Download Court Bundle").upload(
+            lambda: generate_court_bundle()
         )
+    with gr.Tab("Trust & Governance"):
+        gr.HTML(
+            "<iframe src='/gov/index.html' "
+            "style='width:100%;height:700px;border:1px solid #ccc'></iframe>"
         )
+app = gr.mount_gradio_app(fastapi_app, app, path="/")
+app.js = """
+window.askAI = function(idx) {
+  fetch('/ask_ai?index=' + idx)
+}
+"""