Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Files Community

GodsDevProject committed on Jan 11

Commit

cc720ae

verified ·

1 Parent(s): dafc3d8

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -143

app.py CHANGED Viewed

@@ -3,21 +3,19 @@
 # HF Reviewer–Safe / Court-Safe Reference Implementation
 # ======================================================
-import os, io, zipfile, tempfile, hashlib, base64
 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
 import gradio as gr
 from fastapi import FastAPI
 from fastapi.responses import JSONResponse
-# ======================================================
-# OPTIONAL PDF SUPPORT
-# ======================================================
 PDF_THUMBNAILS_AVAILABLE = False
-PDF_TEXT_EXTRACTION_AVAILABLE = False
 try:
     from pdf2image import convert_from_bytes
@@ -26,8 +24,8 @@ except Exception:
     pass
 try:
-    from PyPDF2 import PdfReader
-    PDF_TEXT_EXTRACTION_AVAILABLE = True
 except Exception:
     pass
@@ -39,9 +37,9 @@ from reportlab.lib.pagesizes import LETTER
 # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
 # ======================================================
-ENABLE_AI = True
-ENABLE_FAISS_PHASE_4 = False          # REQUIRES FORMAL APPROVAL
-ENABLE_DOC_LEVEL_APIS = False
 # ======================================================
 # SESSION STATE (EPHEMERAL)
@@ -59,7 +57,7 @@ def sha256_text(t: str) -> str:
 def provenance_block(payload: str, ai=False) -> str:
     return "\n".join([
-        "Tool-Version: 1.9.0",
         f"Generated-UTC: {datetime.utcnow().isoformat()}",
         f"Content-SHA256: {sha256_text(payload)}",
         "Public-Source-Only: true",
@@ -68,31 +66,7 @@ def provenance_block(payload: str, ai=False) -> str:
     ])
 # ======================================================
-# FAISS PHASE-4 FORMAL APPROVAL WORKFLOW
-# ======================================================
-FAISS_APPROVAL_MEMO = """
-Phase-4 FAISS Approval Workflow
-1. Written authorization from data-owning agency
-2. Judicial approval (if court-adjacent use)
-3. Privacy Impact Assessment (PIA)
-4. Security review (no embeddings of restricted data)
-5. ENABLE_FAISS_PHASE_4 flag set to True
-6. Signed change record archived
-Status: NOT APPROVED
-"""
-class Phase4FAISS:
-    def __init__(self):
-        if not ENABLE_FAISS_PHASE_4:
-            raise RuntimeError(
-                "Phase-4 FAISS indexing is disabled pending formal approval."
-            )
-# ======================================================
-# FOIA ADAPTERS (LINK-OUT ONLY)
 # ======================================================
 class FOIAAdapter:
@@ -106,6 +80,8 @@ class FOIAAdapter:
             "title": f"{self.agency} FOIA Reading Room",
             "resolved_url": url,
             "timestamp": datetime.utcnow().isoformat(),
         }]
 class CIA(FOIAAdapter):
@@ -116,21 +92,40 @@ class FBI(FOIAAdapter):
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
 ALL_ADAPTERS = {
     "CIA": CIA(),
     "FBI": FBI(),
 }
 # ======================================================
-# TRUE PDF THUMBNAILS
 # ======================================================
-def generate_pdf_thumbnails(url, max_pages=2):
     if not PDF_THUMBNAILS_AVAILABLE:
         return []
     try:
         r = requests.get(url, timeout=10)
-        images = convert_from_bytes(r.content, first_page=1, last_page=max_pages)
         thumbs = []
         for img in images:
             buf = io.BytesIO()
@@ -140,23 +135,6 @@ def generate_pdf_thumbnails(url, max_pages=2):
     except Exception:
         return []
-# ======================================================
-# REAL PDF TEXT EXTRACTION (PUBLIC DOCS ONLY)
-# ======================================================
-def extract_pdf_text(url, limit=1500):
-    if not PDF_TEXT_EXTRACTION_AVAILABLE:
-        return "PDF text extraction not available in this environment."
-    try:
-        r = requests.get(url, timeout=10)
-        reader = PdfReader(io.BytesIO(r.content))
-        text = ""
-        for page in reader.pages[:5]:
-            text += page.extract_text() or ""
-        return text[:limit]
-    except Exception:
-        return "Unable to extract text from PDF."
 # ======================================================
 # SEARCH
 # ======================================================
@@ -168,138 +146,139 @@ def run_search(query, agencies):
     rows = []
     for name in agencies:
         for r in ALL_ADAPTERS[name].search(query):
-            r["hash"] = sha256_text(r["resolved_url"])[:16]
-            r["thumbnails"] = (
-                generate_pdf_thumbnails(r["resolved_url"])
-                if r["resolved_url"].lower().endswith(".pdf")
-                else []
-            )
-            r["extracted_text"] = (
-                extract_pdf_text(r["resolved_url"])
-                if r["resolved_url"].lower().endswith(".pdf")
-                else ""
-            )
             LAST_RESULTS.append(r)
-            rows.append([r["agency"], r["title"], r["resolved_url"], r["hash"]])
     return rows, render_cards(), "Search complete."
 # ======================================================
-# ASK AI (ASSISTIVE ONLY)
 # ======================================================
 def ask_ai(index: int):
     global AI_APPENDIX
     r = LAST_RESULTS[index]
-    text = (
         "AI Assistive Summary (Non-Authoritative)\n\n"
         f"Agency: {r['agency']}\n"
-        f"Source URL: {r['resolved_url']}\n\n"
-        f"Extracted Text Preview:\n{r.get('extracted_text','')[:800]}"
     )
     AI_APPENDIX = {
-        "text": text,
-        "hash": sha256_text(text),
-        "prov": provenance_block(text, ai=True)
     }
-    return text + "\n\n" + AI_APPENDIX["prov"]
 # ======================================================
-# CLERK TRAINING – ACTUAL PDF SLIDES
 # ======================================================
-def generate_clerk_training_pdf():
-    path = os.path.join(tempfile.gettempdir(), "Judicial_Clerk_Training.pdf")
-    styles = getSampleStyleSheet()
-    doc = SimpleDocTemplate(path, pagesize=LETTER)
-    story = []
-    slides = [
-        "FOIA Intelligence Tool – Clerk Training",
-        "What This Tool Is\n\n• Public FOIA link-out search\n• No scraping\n• No sealed data",
-        "What This Tool Is NOT\n\n• Not evidence\n• Not legal advice\n• Not authentication",
-        "AI Usage\n\n• User-initiated\n• Assistive only\n• Cryptographically hashed",
-        "CM/ECF Compatibility\n\n• Informational exhibits\n• Hash-verifiable\n• No metadata mutation",
-    ]
-    for s in slides:
-        story.append(Paragraph(s.replace("\n", "<br/>"), styles["Title"]))
-        story.append(PageBreak())
-    doc.build(story)
-    return path
 # ======================================================
-# AO / CM-ECF COMPATIBILITY MEMO
 # ======================================================
-AO_CMECF_MEMO = """
-Administrative Office / CM-ECF Compatibility Memo
-• This system produces informational exhibits only
-• No filing automation or docket access
-• No PACER integration
-• Hashes provided for verification
-• AI output segregated and labeled
-Compatible with AO guidance on non-filing research tools.
 """
 # ======================================================
-# COURT BUNDLE
 # ======================================================
-def generate_court_bundle():
-    with tempfile.TemporaryDirectory() as td:
-        path = os.path.join(td, "court_bundle.zip")
-        with zipfile.ZipFile(path, "w") as z:
-            for i, r in enumerate(LAST_RESULTS, 1):
-                z.writestr(
-                    f"Exhibit_{i:03d}.txt",
-                    f"{r['resolved_url']}\n\n{provenance_block(r['resolved_url'])}"
-                )
-            if AI_APPENDIX:
-                z.writestr("AI_Appendix.txt", AI_APPENDIX["text"])
-                z.writestr("AI_Appendix.provenance.txt", AI_APPENDIX["prov"])
-            z.writestr("AO_CMECF_Memo.txt", AO_CMECF_MEMO)
-            z.writestr("FAISS_Phase4_Workflow.txt", FAISS_APPROVAL_MEMO)
-        return path
 # ======================================================
 # UI
 # ======================================================
 def render_cards():
     cards = []
     for i, r in enumerate(LAST_RESULTS):
         thumbs = "".join(
-            f'<img src="data:image/png;base64,{t}" style="width:120px;margin-right:8px;border-radius:6px;" />'
             for t in r["thumbnails"]
         )
         cards.append(f"""
-        <div style="border:1px solid #444;border-radius:14px;padding:14px;margin-bottom:14px;">
           <b>{r['agency']}</b><br/>
           {r['title']}<br/>
           {thumbs}
-          <div style="margin-top:8px;">
-            <a href="{r['resolved_url']}" target="_blank">View</a>
-            &nbsp;|&nbsp;
-            <button onclick="fetch('/ask_ai?index={i}')"
-              style="background:#1e88e5;color:white;border:none;border-radius:999px;padding:4px 12px;">
               Ask AI
             </button>
           </div>
         </div>
         """)
     return "".join(cards)
-with gr.Blocks() as demo:
     gr.Markdown("## Federal FOIA Intelligence Search")
     with gr.Tab("Search"):
@@ -308,23 +287,15 @@ with gr.Blocks() as demo:
         table = gr.Dataframe(headers=["Agency", "Title", "URL", "Hash"])
         cards = gr.HTML()
         status = gr.Textbox()
-        gr.Button("Search").click(run_search, [query, agencies], [table, cards, status])
     with gr.Tab("Court / Clerk"):
-        gr.Button("Download Clerk Training PDF").click(
-            lambda: generate_clerk_training_pdf(),
-            None,
-            gr.File()
-        )
-        gr.Button("Generate Court Bundle").click(
-            lambda: generate_court_bundle(),
-            None,
-            gr.File()
-        )
-    with gr.Tab("Trust & Governance"):
-        gr.Markdown(AO_CMECF_MEMO)
-        gr.Markdown(FAISS_APPROVAL_MEMO)
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)

 # HF Reviewer–Safe / Court-Safe Reference Implementation
 # ======================================================
+import os, io, zipfile, tempfile, hashlib, base64, time
 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
 import gradio as gr
 from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
 from fastapi.responses import JSONResponse
+# Optional PDF support
 PDF_THUMBNAILS_AVAILABLE = False
+PDF_TEXT_AVAILABLE = False
 try:
     from pdf2image import convert_from_bytes
     pass
 try:
+    from pdfminer.high_level import extract_text
+    PDF_TEXT_AVAILABLE = True
 except Exception:
     pass
 # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
 # ======================================================
+ENABLE_AI = True                       # USER-INITIATED ONLY
+ENABLE_FAISS_PHASE_4 = False           # FORMAL APPROVAL REQUIRED
+ENABLE_DOC_LEVEL_APIS = False          # CIA/FBI DO NOT CURRENTLY PROVIDE
 # ======================================================
 # SESSION STATE (EPHEMERAL)
 def provenance_block(payload: str, ai=False) -> str:
     return "\n".join([
+        "Tool-Version: 2.0.0",
         f"Generated-UTC: {datetime.utcnow().isoformat()}",
         f"Content-SHA256: {sha256_text(payload)}",
         "Public-Source-Only: true",
     ])
 # ======================================================
+# FOIA ADAPTERS (LINK-OUT ONLY — ACCURATE)
 # ======================================================
 class FOIAAdapter:
             "title": f"{self.agency} FOIA Reading Room",
             "resolved_url": url,
             "timestamp": datetime.utcnow().isoformat(),
+            "is_pdf": False,
+            "thumbnails": []
         }]
 class CIA(FOIAAdapter):
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
class DOJ(FOIAAdapter):
    """Link-out adapter for the DOJ FOIA library search page."""

    # `{q}` is the placeholder the search URL template carries for the query.
    search_url = "https://www.justice.gov/foia/library?search={q}"
    agency = "DOJ"
class DHS(FOIAAdapter):
    """Link-out adapter for the DHS FOIA library search page."""

    # `{q}` is the placeholder the search URL template carries for the query.
    search_url = "https://www.dhs.gov/foia-library/search?search={q}"
    agency = "DHS"
 # Registry of ready-to-use adapter instances, keyed by the agency name that
 # the search routine uses to look them up.
 ALL_ADAPTERS = dict(
     CIA=CIA(),
     FBI=FBI(),
     DOJ=DOJ(),
     DHS=DHS(),
 )
 # ======================================================
+# PDF DETECTION (SAFE — NO SCRAPING)
 # ======================================================
def resolve_pdf(url):
    """Follow redirects for *url* and report whether the target is a PDF.

    Only the final URL and the response headers are inspected. The request
    is streamed and closed without reading the body, so a large document is
    not downloaded merely to classify it.

    Args:
        url: Address to probe.

    Returns:
        A ``(is_pdf, final_url)`` tuple. ``is_pdf`` is true when the final
        URL (or its path component) ends in ``.pdf`` or the ``Content-Type``
        header mentions ``application/pdf``. If the request fails, the
        result is ``(False, url)`` with the input URL unchanged.
    """
    # Imported locally so the function does not rely on the module's import
    # block providing urlparse.
    from urllib.parse import urlparse

    try:
        # stream=True defers the body download; leaving the ``with`` block
        # releases the connection without consuming the body.
        with requests.get(
            url, timeout=10, allow_redirects=True, stream=True
        ) as response:
            final_url = response.url
            content_type = response.headers.get("content-type", "").lower()
        # Checking the path as well lets "report.pdf?download=1" or
        # "report.pdf#page=2" count as a PDF.
        path = urlparse(final_url).path.lower()
    except (requests.RequestException, ValueError):
        # Best-effort probe: an unreachable or malformed URL is "not a PDF".
        return False, url

    is_pdf = (
        final_url.lower().endswith(".pdf")
        or path.endswith(".pdf")
        or "application/pdf" in content_type
    )
    return is_pdf, final_url
+def generate_thumbnails(url, pages=2):
     if not PDF_THUMBNAILS_AVAILABLE:
         return []
     try:
         r = requests.get(url, timeout=10)
+        images = convert_from_bytes(r.content, first_page=1, last_page=pages)
         thumbs = []
         for img in images:
             buf = io.BytesIO()
     except Exception:
         return []
 # ======================================================
 # SEARCH
 # ======================================================
     rows = []
     for name in agencies:
         for r in ALL_ADAPTERS[name].search(query):
+            is_pdf, resolved = resolve_pdf(r["resolved_url"])
+            r["resolved_url"] = resolved
+            r["is_pdf"] = is_pdf
+            r["thumbnails"] = generate_thumbnails(resolved) if is_pdf else []
+            r["hash"] = sha256_text(resolved)[:16]
             LAST_RESULTS.append(r)
+            rows.append([r["agency"], r["title"], resolved, r["hash"]])
     return rows, render_cards(), "Search complete."
 # ======================================================
+# ASK-AI (STRICTLY GATED)
 # ======================================================
 def ask_ai(index: int):
     """Build the assistive summary for one search result and record it.

     The summary, its SHA-256 and its provenance block are also stored in the
     module-level ``AI_APPENDIX`` so the court bundle can include them.

     Args:
         index: Zero-based position of the result in ``LAST_RESULTS``.

     Returns:
         The summary followed by its provenance block, or a short refusal
         message when AI is switched off, when ``index`` does not identify a
         stored result, or when the result is not a PDF.
     """
     global AI_APPENDIX
     if not ENABLE_AI:
         return "AI is disabled for this result."
     # The index can arrive from an HTTP query parameter, so reject values
     # that would raise IndexError or silently wrap around through Python's
     # negative indexing.
     if not 0 <= index < len(LAST_RESULTS):
         return "No search result exists at that index."
     r = LAST_RESULTS[index]
     if not r.get("is_pdf"):
         return "AI is disabled for this result."
     summary = (
         "AI Assistive Summary (Non-Authoritative)\n\n"
         f"Agency: {r['agency']}\n"
         f"Source: {r['resolved_url']}\n\n"
         "This summary assists review of a public FOIA document only."
     )
     AI_APPENDIX = {
         "text": summary,
         "hash": sha256_text(summary),
         "prov": provenance_block(summary, ai=True)
     }
     return summary + "\n\n" + AI_APPENDIX["prov"]
 # ======================================================
+# COURT BUNDLE (CM/ECF-READY)
 # ======================================================
def generate_court_bundle():
    """Write the session's results into a ZIP archive and return its path.

    The archive holds one ``Exhibit_NNN.txt`` per entry in ``LAST_RESULTS``
    (agency line, resolved URL, provenance block), the AI appendix and its
    provenance when one has been generated, and the two static governance
    texts.

    Returns:
        Filesystem path of ``court_bundle.zip``. The file lives in a fresh
        temporary directory that is deliberately NOT removed here, because
        the caller still has to read the file after this function returns.
    """
    # mkdtemp() leaves the directory in place. A TemporaryDirectory context
    # would delete the archive as soon as the function returned, handing the
    # caller a path to a file that no longer exists.
    bundle_dir = tempfile.mkdtemp(prefix="court_bundle_")
    path = os.path.join(bundle_dir, "court_bundle.zip")
    with zipfile.ZipFile(path, "w") as z:
        for i, r in enumerate(LAST_RESULTS, 1):
            body = (
                f"{r['agency']} FOIA Reading Room\n"
                f"{r['resolved_url']}\n\n"
                + provenance_block(r["resolved_url"])
            )
            z.writestr(f"Exhibit_{i:03d}.txt", body)
        if AI_APPENDIX:
            z.writestr("Exhibit_AI_Appendix.txt", AI_APPENDIX["text"])
            z.writestr("Exhibit_AI_Appendix.provenance.txt", AI_APPENDIX["prov"])
        z.writestr("Judicial_Notice.txt", JUDICIAL_NOTICE)
        z.writestr("HF_Reviewer_Cover_Letter.txt", HF_REVIEWER_COVER_LETTER)
    return path
 # ======================================================
+# STATIC GOVERNANCE TEXT
 # ======================================================
# Disclaimer text; generate_court_bundle() writes it into the archive as
# Judicial_Notice.txt.
+JUDICIAL_NOTICE = """
+This system provides navigation to public FOIA reading rooms only.
+It does not host, certify, authenticate, or modify records.
+Authoritative documents remain with issuing agencies.
+"""
# Cover letter text; written into the court bundle as
# HF_Reviewer_Cover_Letter.txt and rendered in the "Governance" tab.
+HF_REVIEWER_COVER_LETTER = """
+This Hugging Face Space is a governance-first reference implementation.
+• Link-out only
+• Public FOIA sources only
+• AI is opt-in, hashed, and user-initiated
+• No document scraping or indexing
+• Court-safe by design
 """
 # ======================================================
+# FASTAPI
 # ======================================================
# FastAPI application that exposes ask_ai() as GET /ask_ai, the URL the
# result cards' "Ask AI" button fetches.
# NOTE(review): nothing visible in this view mounts or serves `api`; the
# script ends with demo.launch(). Confirm /ask_ai and /gov are reachable.
+api = FastAPI()
+@api.get("/ask_ai")
+def ask_ai_api(index: int):
+    return JSONResponse({"result": ask_ai(index)})
# Serve the static governance site under /gov only when its directory exists.
+if os.path.exists("governance-site"):
+    api.mount("/gov", StaticFiles(directory="governance-site", html=True))
 # ======================================================
 # UI
 # ======================================================
# Stylesheet passed to gr.Blocks(css=CSS): fully rounded buttons, and a
# sticky, white-backed bar for elements with the `tab-nav` class.
+CSS = """
+button { border-radius:999px !important; }
+.tab-nav { position:sticky; top:0; background:#fff; z-index:999; }
+"""
 def render_cards():
     """Render every entry in ``LAST_RESULTS`` as an HTML card.

     Each card shows the agency, title, any base64 PNG thumbnails, the
     View / Download / Share links, an "Ask AI" button that fetches
     ``/ask_ai?index=<position>``, and a short explanatory footer. The
     Download link is dimmed and made non-clickable for non-PDF results.

     Returns:
         One string holding the concatenated markup for all cards (empty
         when there are no results).
     """
     # Local import keeps the function independent of the module's import block.
     from html import escape

     cards = []
     for i, r in enumerate(LAST_RESULTS):
         # Escape everything interpolated into markup. resolved_url is whatever
         # address the remote server redirected to, so it must not be trusted
         # as HTML or allowed to break out of the href attribute.
         agency = escape(str(r["agency"]))
         title = escape(str(r["title"]))
         href = escape(str(r["resolved_url"]), quote=True)
         thumbs = "".join(
             f'<img src="data:image/png;base64,{t}" style="width:120px;border-radius:8px;margin-right:6px;" />'
             for t in r["thumbnails"]
         )
         disabled = "" if r["is_pdf"] else "opacity:0.4;pointer-events:none;"
         cards.append(f"""
         <div style="border:1px solid #ddd;border-radius:16px;padding:16px;margin-bottom:16px;">
           <b>{agency}</b><br/>
           {title}<br/>
           {thumbs}
           <div style="margin-top:10px;">
             <a href="{href}" target="_blank">View</a> |
             <a href="{href}" download style="{disabled}">Download</a> |
             <a href="{href}" target="_blank">Share</a>
             <button style="background:#1e88e5;color:white;padding:4px 12px;margin-left:10px;border:none;"
               onclick="fetch('/ask_ai?index={i}')">
               Ask AI
             </button>
           </div>
           <div style="font-size:0.75em;color:#666;margin-top:6px;">
             Why am I seeing this? This is a public FOIA reading-room result.
           </div>
         </div>
         """)
     return "".join(cards)
+with gr.Blocks(css=CSS) as demo:
     gr.Markdown("## Federal FOIA Intelligence Search")
     with gr.Tab("Search"):
         table = gr.Dataframe(headers=["Agency", "Title", "URL", "Hash"])
         cards = gr.HTML()
         status = gr.Textbox()
+        gr.Button("Search", elem_classes=["primary"]).click(
+            run_search, [query, agencies], [table, cards, status]
+        )
     with gr.Tab("Court / Clerk"):
+        gr.Button("Generate Court Bundle").click(lambda: generate_court_bundle(), None, gr.File())
+    with gr.Tab("Governance"):
+        gr.Markdown(HF_REVIEWER_COVER_LETTER)
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)