Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Community

GodsDevProject committed on Jan 11

Commit

eb5e858

verified ·

1 Parent(s): 2e91748

Update app.py

Browse files

Files changed (1) hide show

app.py +230 -149

app.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import gradio as gr
 import time
 import hashlib
-import zipfile
 import io
-import uuid
 from datetime import datetime
-from urllib.parse import quote_plus, urlparse
-from collections import Counter
 import requests
-import plotly.graph_objects as go
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet
@@ -17,27 +17,124 @@ from citations import bluebook_exhibit, table_of_authorities
 from foia_requests import generate_foia_request_text
 # ======================================================
-# OPTIONAL PDF TEXT EXTRACTION (STRICTLY OPT-IN)
 # ======================================================
 PDF_TEXT_AVAILABLE = False
 try:
     from pdfminer.high_level import extract_text
     PDF_TEXT_AVAILABLE = True
 except Exception:
-    PDF_TEXT_AVAILABLE = False
 # ======================================================
-# FEATURE GATES (HF SAFE)
 # ======================================================
-ENABLE_AI = True
-ENABLE_PDF_EXTRACTION = True
-ENABLE_LITIGATION_PDF = True
-ENABLE_COVERAGE_HEATMAP = True
 # ======================================================
-# BASE ADAPTER (LINK-OUT ONLY)
 # ======================================================
 class FOIAAdapter:
@@ -56,10 +153,6 @@ class FOIAAdapter:
             "latency_ms": latency
         }]
-# ======================================================
-# LIVE AGENCIES (PUBLIC READING ROOMS)
-# ======================================================
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
@@ -88,234 +181,222 @@ class NSA(FOIAAdapter):
     agency = "NSA"
     search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/?q={q}"
-LIVE_ADAPTERS = [CIA(), FBI(), DOJ(), DHS(), STATE(), GSA(), NSA()]
-# ======================================================
-# GLOBAL STATE (SESSION MEMORY ONLY)
-# ======================================================
-LAST_RESULTS = []
-SELECTED_INDEX = None
-# ======================================================
-# UTILITIES
-# ======================================================
-def citation_hash(r):
-    return hashlib.sha256(
-        f"{r['agency']}|{r['url']}|{r['timestamp']}".encode()
-    ).hexdigest()[:16]
-def ai_disclosure():
-    return (
-        "\n\n---\n"
-        "AI DISCLOSURE\n"
-        "• User-initiated analysis only\n"
-        "• Public FOIA materials only\n"
-        "• AI output is not evidence or legal advice\n"
-        "• Verify against original sources\n"
-    )
-def hash_ai_output(text):
-    return hashlib.sha256(text.encode()).hexdigest()
 # ======================================================
 # SEARCH
 # ======================================================
-def run_search(query):
-    global LAST_RESULTS
     LAST_RESULTS = []
     rows = []
-    for adapter in LIVE_ADAPTERS:
         for r in adapter.search(query):
             r["hash"] = citation_hash(r)
             LAST_RESULTS.append(r)
             rows.append([
                 r["agency"],
                 r["title"],
-                r["url"],
                 r["hash"],
                 f"{r['latency_ms']} ms"
             ])
-    return rows, render_cards()
 # ======================================================
-# RESULTS CARDS (POLISHED)
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
-        preview = (
-            f"<iframe src='{r['url']}' height='220' width='100%'></iframe>"
-            if r["url"].lower().endswith(".pdf")
-            else f"<a href='{r['url']}' target='_blank'>Open FOIA Page</a>"
         )
         cards.append(f"""
         <div class="card">
             <div class="card-header">
                 <b>{r['agency']}</b>
-                <span class="badge">⏱ {r['latency_ms']} ms</span>
             </div>
             <div class="card-title">{r['title']}</div>
             {preview}
             <div class="actions">
-                <button onclick="selectDoc({idx})">Ask AI</button>
-                <a href="{r['url']}" target="_blank">View</a>
-                <a href="{r['url']}" download>Download</a>
             </div>
         </div>
         """)
-    return "".join(cards) if cards else "<i>No results</i>"
 # ======================================================
-# PDF EXTRACTION (OPT-IN)
 # ======================================================
-def extract_pdf_text(url):
-    if not (PDF_TEXT_AVAILABLE and ENABLE_PDF_EXTRACTION):
-        return ""
-    try:
-        r = requests.get(url, timeout=15)
-        with open("/tmp/doc.pdf", "wb") as f:
-            f.write(r.content)
-        return extract_text("/tmp/doc.pdf")[:6000]
-    except Exception:
-        return ""
 # ======================================================
-# AI ASK + CITATION CROSS-CHECK
 # ======================================================
 def ask_ai(opt_in, pdf_opt_in, question):
     if not opt_in:
-        return "⚠ AI requires explicit opt-in."
     if SELECTED_INDEX is None:
-        return "⚠ Select a document first."
     r = LAST_RESULTS[SELECTED_INDEX]
-    context = ""
-    if pdf_opt_in and r["url"].lower().endswith(".pdf"):
-        context = extract_pdf_text(r["url"])
     analysis = (
-        f"{bluebook_exhibit(r, SELECTED_INDEX + 1)}\n\n"
-        f"User Question:\n{question}\n\n"
-        f"Extracted Context:\n{context[:1500]}\n\n"
-        f"AI Summary:\nThis is a public FOIA document. "
-        f"Assertions should be verified against the cited exhibit."
     )
     final = analysis + ai_disclosure()
     return final + f"\n\nIntegrity Hash: {hash_ai_output(final)}"
 # ======================================================
-# LITIGATION APPENDIX (WITH TOA)
 # ======================================================
-def litigation_appendix():
     buf = io.BytesIO()
-    doc = SimpleDocTemplate(buf)
     styles = getSampleStyleSheet()
     story = []
-    story.append(Paragraph("Litigation Appendix", styles["Title"]))
     story.append(Spacer(1, 12))
-    story.append(Paragraph("Table of Authorities", styles["Heading1"]))
-    for line in table_of_authorities(LAST_RESULTS):
-        story.append(Paragraph(line, styles["Normal"]))
-    story.append(PageBreak())
     for i, r in enumerate(LAST_RESULTS, start=1):
-        story.append(Paragraph(f"Exhibit A-{i}", styles["Heading2"]))
-        story.append(Paragraph(bluebook_exhibit(r, i), styles["Normal"]))
-        story.append(Spacer(1, 8))
     doc.build(story)
     buf.seek(0)
     return buf
 # ======================================================
-# COVERAGE HEATMAP
 # ======================================================
-def coverage_heatmap():
-    counts = Counter(r["agency"] for r in LAST_RESULTS)
-    return go.Figure(
-        data=go.Heatmap(
-            z=[[counts.get(a.agency, 0)] for a in LIVE_ADAPTERS],
-            x=["Results"],
-            y=[a.agency for a in LIVE_ADAPTERS],
-            colorscale="Blues"
-        ),
-        layout=go.Layout(title="Agency Coverage Heatmap")
-    )
-# ======================================================
-# FOIA REQUEST GENERATOR
-# ======================================================
-def foia_request(agency, subject, requester):
-    return generate_foia_request_text(agency, subject, requester)
 # ======================================================
 # UI
 # ======================================================
 CSS = """
-.search textarea {font-size:18px;padding:14px}
 .card {border:1px solid #ddd;border-radius:14px;padding:16px;margin-bottom:18px}
 .card-header {display:flex;justify-content:space-between}
-.card-title {margin:8px 0 12px}
-.actions button, .actions a {margin-right:10px}
-.badge {background:#eef;padding:4px 8px;border-radius:8px;font-size:12px}
 """
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
-    gr.Markdown("## 🏛️ Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
-    with gr.Tab("🔍 Search"):
-        query = gr.Textbox(
-            label="Search FOIA Reading Rooms",
-            elem_classes=["search"],
-            placeholder="e.g. procurement, AATIP, surveillance"
         )
-        search_btn = gr.Button("Search", variant="primary")
-        table = gr.Dataframe(headers=["Agency","Title","URL","Hash","Latency"])
         gallery = gr.HTML()
-        search_btn.click(run_search, query, [table, gallery])
-    with gr.Tab("🧠 Ask AI"):
-        ai_opt = gr.Checkbox(label="Enable AI (Explicit Opt-In)")
         pdf_opt = gr.Checkbox(label="Allow PDF Text Extraction")
-        question = gr.Textbox(lines=4)
-        answer = gr.Textbox(lines=18)
-        gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, question], answer)
-    with gr.Tab("📊 Analysis"):
-        gr.Button("Coverage Heatmap").click(coverage_heatmap, outputs=gr.Plot())
-    with gr.Tab("⚖️ Court Tools"):
-        gr.Button("Generate Litigation Appendix PDF").click(
-            litigation_appendix, outputs=gr.File()
         )
-    with gr.Tab("📝 FOIA Request"):
-        agency = gr.Textbox(label="Agency")
-        subject = gr.Textbox(label="Records Requested")
-        requester = gr.Textbox(label="Requester Name")
-        output = gr.Textbox(lines=14)
-        gr.Button("Generate FOIA Request").click(
-            foia_request, [agency, subject, requester], output
         )
 app.launch()

 import gradio as gr
 import time
 import hashlib
 import io
+import json
+import zipfile
+import base64
 from datetime import datetime
+from urllib.parse import quote_plus
 import requests
+import os
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet
 from foia_requests import generate_foia_request_text
 # ======================================================
+# HARD FEATURE FLAGS (GOVERNANCE ENFORCED)
+# ======================================================
+ENABLE_FAISS_PHASE_4 = False   # MUST remain False unless formal approval
+ENABLE_AI = True
+ENABLE_PDF_EXTRACTION = True
+# ======================================================
+# OPTIONAL PDF SUPPORT
 # ======================================================
 PDF_TEXT_AVAILABLE = False
+PDF_THUMBNAIL_AVAILABLE = False
 try:
     from pdfminer.high_level import extract_text
     PDF_TEXT_AVAILABLE = True
 except Exception:
+    pass
+try:
+    from pdf2image import convert_from_bytes
+    PDF_THUMBNAIL_AVAILABLE = True
+except Exception:
+    pass
 # ======================================================
+# SESSION STATE
 # ======================================================
+LAST_RESULTS = []
+SELECTED_INDEX = None
 # ======================================================
+# HELPERS
+# ======================================================
def citation_hash(r):
    """Return a short fingerprint for one search result.

    The result's ``agency``, ``url`` and ``timestamp`` values are joined with
    ``|`` and hashed with SHA-256; only the first 16 hex characters of the
    digest are returned.
    """
    fingerprint_source = "{}|{}|{}".format(r["agency"], r["url"], r["timestamp"])
    full_digest = hashlib.sha256(fingerprint_source.encode()).hexdigest()
    return full_digest[:16]
def signed_permalink_manifest(results, generated_utc=None):
    """Build a hash-anchored manifest of search results for citation or audit.

    Despite the name, nothing is cryptographically signed: the manifest only
    carries a SHA-256 digest of its own contents. That detects modification
    of the manifest body but does not authenticate who produced it.

    Args:
        results: Iterable of result dicts. Each must provide the keys
            ``agency``, ``title``, ``resolved_url`` and ``hash``.
        generated_utc: Optional timestamp string to embed. When omitted the
            current UTC time is used, so two calls yield different manifests;
            pass a fixed value to obtain a reproducible manifest.

    Returns:
        A dict with ``generated_utc``, ``tool``, ``documents`` and
        ``manifest_hash``. ``manifest_hash`` is the SHA-256 hex digest of the
        other three keys serialized as JSON with sorted keys, so a verifier
        must remove ``manifest_hash`` before recomputing the digest.

    Raises:
        KeyError: If a result lacks one of the required keys.
    """
    if generated_utc is None:
        # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
        # afterwards keeps the naive ISO format the manifest has always used.
        from datetime import timezone

        generated_utc = (
            datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        )

    payload = {
        "generated_utc": generated_utc,
        "tool": "Federal FOIA Intelligence Search",
        "documents": [
            {
                "exhibit": number,
                "agency": r["agency"],
                "title": r["title"],
                "resolved_url": r["resolved_url"],
                "hash": r["hash"],
            }
            for number, r in enumerate(results, start=1)
        ],
    }
    # The digest is taken BEFORE the manifest_hash key exists; sort_keys makes
    # the serialization independent of dict insertion order.
    payload["manifest_hash"] = hashlib.sha256(
        json.dumps(payload, sort_keys=True).encode()
    ).hexdigest()
    return payload
def fre_callout():
    """Return the fixed, educational Federal Rules of Evidence reference note.

    The text lists three rules, one per bulleted line, and ends with a
    "Not legal advice." line that has no trailing newline.
    """
    rules = (
        ("901", "Authentication"),
        ("803(8)", "Public Records Exception"),
        ("1005", "Copies of Public Records"),
    )
    lines = ["FRE Reference (Educational):"]
    lines.extend(f"• Rule {number} – {topic}" for number, topic in rules)
    lines.append("Not legal advice.")
    return "\n".join(lines)
def ai_disclosure():
    """Return the fixed disclosure footer appended to every AI response.

    The footer starts with a blank line and a ``---`` rule, followed by the
    ``AI DISCLOSURE`` heading and one bulleted line per disclosure item; it
    ends with a trailing newline.
    """
    items = (
        "User-initiated only",
        "Public FOIA documents only",
        "No legal advice",
        "Verify against cited exhibit",
    )
    bullet_lines = "".join(f"• {item}\n" for item in items)
    return "\n\n---\nAI DISCLOSURE\n" + bullet_lines
def hash_ai_output(text):
    """Return the full SHA-256 hex digest of ``text`` (encoded as UTF-8)."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
def resolve_pdf_url(url):
    """Follow redirects for ``url`` and report whether it serves a PDF.

    Only the response status and headers are inspected; the body is never
    downloaded here.

    Args:
        url: The address to probe.

    Returns:
        A ``(is_pdf, final_url)`` tuple. ``final_url`` is the address after
        redirects. ``is_pdf`` is True when the request succeeded and either
        the final URL (or its path, ignoring any query string) ends in
        ``.pdf`` or the Content-Type mentions ``application/pdf``. If the
        request itself fails, ``(False, url)`` is returned with the original
        address.
    """
    from urllib.parse import urlparse

    try:
        # stream=True defers the body download; leaving the ``with`` block
        # closes the connection without ever reading the document.
        with requests.get(
            url,
            timeout=15,
            allow_redirects=True,
            stream=True,
            headers={"User-Agent": "FOIA-Research-Tool"},
        ) as response:
            final_url = response.url
            if not response.ok:
                # An error page at a ".pdf" address is not a usable PDF.
                return False, final_url
            content_type = response.headers.get("content-type", "").lower()
        lowered = final_url.lower()
        # Checking the parsed path as well catches "file.pdf?download=1".
        path = urlparse(lowered).path
    except (requests.RequestException, ValueError):
        return False, url

    is_pdf = (
        lowered.endswith(".pdf")
        or path.endswith(".pdf")
        or "application/pdf" in content_type
    )
    return is_pdf, final_url
def generate_pdf_thumbnails(url, max_pages=3, max_px=400):
    """Render the first pages of a remote PDF as base64-encoded PNG thumbnails.

    Args:
        url: Address of the PDF to download.
        max_pages: Number of leading pages to render.
        max_px: Upper bound, in pixels, for each thumbnail's width and height.
            Pages are shrunk to fit so the inline ``data:`` images stay small.

    Returns:
        A list of base64 strings, one PNG per rendered page. An empty list is
        returned when thumbnail support is unavailable, ``max_pages`` is less
        than 1, or downloading/rendering fails for any reason.
    """
    if not PDF_THUMBNAIL_AVAILABLE or max_pages < 1:
        return []
    try:
        response = requests.get(
            url,
            timeout=15,
            # Same User-Agent as resolve_pdf_url so both requests are treated
            # alike by the remote server.
            headers={"User-Agent": "FOIA-Research-Tool"},
        )
        response.raise_for_status()
        pages = convert_from_bytes(
            response.content, first_page=1, last_page=max_pages
        )
        thumbs = []
        for page in pages:
            # NOTE(review): relies on pdf2image returning PIL images, whose
            # thumbnail() shrinks in place while keeping the aspect ratio.
            page.thumbnail((max_px, max_px))
            png = io.BytesIO()
            page.save(png, format="PNG")
            thumbs.append(base64.b64encode(png.getvalue()).decode("ascii"))
        return thumbs
    except Exception:
        # Deliberately broad: thumbnails are a best-effort nicety, and a bad
        # download or an unreadable PDF must never break the search itself.
        return []
+# ======================================================
+# FOIA ADAPTERS
 # ======================================================
 class FOIAAdapter:
             "latency_ms": latency
         }]
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
     agency = "NSA"
     search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/?q={q}"
+ALL_ADAPTERS = {
+    "CIA": CIA(),
+    "FBI": FBI(),
+    "DOJ": DOJ(),
+    "DHS": DHS(),
+    "State": STATE(),
+    "GSA": GSA(),
+    "NSA": NSA()
+}
 # ======================================================
 # SEARCH
 # ======================================================
def run_search(query, agencies):
    """Search the selected agencies and refresh the module-level result state.

    Each result dict returned by an adapter is enriched in place with
    ``hash``, ``resolved_pdf``, ``resolved_url`` and ``thumbnails`` before it
    is stored.

    Args:
        query: Search text handed to every adapter's ``search`` method.
        agencies: Iterable of ``ALL_ADAPTERS`` keys. ``None`` is treated as
            an empty selection and unknown names are skipped.

    Returns:
        A ``(rows, cards_html, status)`` tuple: table rows of
        ``[agency, title, resolved_url, hash, latency]``, the HTML from
        ``render_cards()``, and the fixed status ``"No document selected"``.

    Side effects:
        Replaces the global ``LAST_RESULTS`` and resets ``SELECTED_INDEX`` to
        ``None``. Both are module globals, so every caller shares them.
    """
    global LAST_RESULTS, SELECTED_INDEX
    SELECTED_INDEX = None

    results = []
    rows = []
    for name in agencies or []:
        adapter = ALL_ADAPTERS.get(name)
        if adapter is None:
            # Ignore names that do not map to a configured adapter instead of
            # aborting the whole search with a KeyError.
            continue
        for r in adapter.search(query):
            r["hash"] = citation_hash(r)
            r["resolved_pdf"], r["resolved_url"] = resolve_pdf_url(r["url"])
            r["thumbnails"] = (
                generate_pdf_thumbnails(r["resolved_url"])
                if r["resolved_pdf"]
                else []
            )
            results.append(r)
            rows.append([
                r["agency"],
                r["title"],
                r["resolved_url"],
                r["hash"],
                f"{r['latency_ms']} ms",
            ])

    # Publish the finished list in a single assignment so readers of
    # LAST_RESULTS never see a half-built result set.
    LAST_RESULTS = results
    return rows, render_cards(), "No document selected"
 # ======================================================
+# RENDER CARDS
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
+        thumbs = "".join(
+            f'<img src="data:image/png;base64,{t}" style="width:32%;margin:2px;border:1px solid #ccc" />'
+            for t in r["thumbnails"]
         )
+        preview = thumbs or f'<a href="{r["resolved_url"]}" target="_blank">Open FOIA Reading Room</a>'
         cards.append(f"""
         <div class="card">
             <div class="card-header">
                 <b>{r['agency']}</b>
+                <span class="badge">{r['latency_ms']} ms</span>
             </div>
             <div class="card-title">{r['title']}</div>
             {preview}
             <div class="actions">
+                <button onclick="selectDoc({idx})">Select</button>
+                <a href="{r['resolved_url']}" target="_blank">View</a>
             </div>
         </div>
         """)
+    return "".join(cards) or "<i>No results</i>"
 # ======================================================
+# DOC SELECTION
 # ======================================================
def select_doc(idx):
    """Record which search result later actions should operate on.

    Args:
        idx: Zero-based position in the global ``LAST_RESULTS``. Values that
            ``int()`` accepts (for example a float from a number widget) are
            converted.

    Returns:
        ``"Selected document #N"`` (one-based) on success. When ``idx`` is
        missing, not numeric, or outside the current results, the selection
        is cleared and ``"No document selected"`` is returned.
    """
    global SELECTED_INDEX
    try:
        position = int(idx)
    except (TypeError, ValueError, OverflowError):
        position = None

    # Reject stale or out-of-range positions here so that later indexing of
    # LAST_RESULTS cannot raise IndexError.
    if position is None or not 0 <= position < len(LAST_RESULTS):
        SELECTED_INDEX = None
        return "No document selected"

    SELECTED_INDEX = position
    return f"Selected document #{position + 1}"
 # ======================================================
+# AI ASK
 # ======================================================
 def ask_ai(opt_in, pdf_opt_in, question):
     if not opt_in:
+        return "Explicit AI opt-in required."
     if SELECTED_INDEX is None:
+        return "Select a document first."
     r = LAST_RESULTS[SELECTED_INDEX]
+    if not r["resolved_pdf"]:
+        return "AI available only for public PDFs."
+    context = ""
+    pin_cite = "n.p."
+    if pdf_opt_in and PDF_TEXT_AVAILABLE:
+        try:
+            raw = extract_text(io.BytesIO(
+                requests.get(r["resolved_url"], timeout=15).content
+            ))
+            context = raw[:4000]
+            pin_cite = "p. 1"
+        except Exception:
+            pass
     analysis = (
+        f"{bluebook_exhibit(r, SELECTED_INDEX + 1, pin=pin_cite)}\n\n"
+        f"{fre_callout()}\n\n"
+        f"Question:\n{question}\n\n"
+        f"Context:\n{context}"
     )
     final = analysis + ai_disclosure()
     return final + f"\n\nIntegrity Hash: {hash_ai_output(final)}"
 # ======================================================
+# CLERK EXHIBIT PACKET (PDF)
 # ======================================================
def generate_exhibit_packet():
    """Build a PDF listing every entry of the global ``LAST_RESULTS``.

    Each result becomes one exhibit section with a heading
    (``Exhibit N: agency — title``), its resolved URL and its citation hash.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the PDF bytes.
    """
    # Paragraph text is parsed as markup, so characters such as "&" and "<"
    # in externally sourced titles and URLs must be escaped first.
    from xml.sax.saxutils import escape

    buf = io.BytesIO()
    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(buf)

    story = [
        Paragraph("Exhibit Packet (Clerk Format)", styles["Title"]),
        Spacer(1, 12),
    ]
    if not LAST_RESULTS:
        # Say why the packet is empty rather than emitting a title-only PDF.
        story.append(
            Paragraph("No exhibits: run a search first.", styles["Normal"])
        )
    for i, r in enumerate(LAST_RESULTS, start=1):
        heading = escape(f"Exhibit {i}: {r['agency']} — {r['title']}")
        story.append(Paragraph(heading, styles["Heading2"]))
        story.append(Paragraph(escape(str(r["resolved_url"])), styles["Normal"]))
        story.append(Paragraph(escape(f"Hash: {r['hash']}"), styles["Code"]))
        story.append(Spacer(1, 12))

    doc.build(story)
    buf.seek(0)
    return buf
 # ======================================================
+# PACER-READY BUNDLE (ZIP)
 # ======================================================
def generate_pacer_bundle():
    """Build a ZIP bundle describing the global ``LAST_RESULTS``.

    The archive contains ``manifest.json`` (the output of
    ``signed_permalink_manifest`` pretty-printed as JSON) and a short
    ``README.txt``. The documents themselves are not included.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the ZIP bytes.
    """
    manifest = signed_permalink_manifest(LAST_RESULTS)
    buf = io.BytesIO()
    # The context manager finalizes the archive even if a write raises.
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("manifest.json", json.dumps(manifest, indent=2))
        bundle.writestr(
            "README.txt",
            "PACER-Ready Educational Bundle\n"
            "No filing performed. User responsible for review.\n",
        )
    buf.seek(0)
    return buf
 # ======================================================
 # UI
 # ======================================================
 CSS = """
 .card {border:1px solid #ddd;border-radius:14px;padding:16px;margin-bottom:18px}
 .card-header {display:flex;justify-content:space-between}
+.badge {background:#eef;padding:4px 8px;border-radius:8px}
 """
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
+    gr.Markdown("## Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
+    gr.HTML("""
+    <button onclick="window.open('/governance-site/index.html','_blank')">
+      Governance & Trust Documentation
+    </button>
+    """)
+    with gr.Tab("Search"):
+        agencies = gr.CheckboxGroup(
+            choices=list(ALL_ADAPTERS.keys()),
+            value=list(ALL_ADAPTERS.keys()),
+            label="Agencies"
         )
+        query = gr.Textbox(placeholder="e.g. AATIP, surveillance")
+        table = gr.Dataframe(headers=["Agency","Title","Resolved URL","Hash","Latency"])
         gallery = gr.HTML()
+        status = gr.Textbox(label="Selection Status")
+        gr.Button("Search").click(run_search, [query, agencies], [table, gallery, status])
+    with gr.Tab("Ask AI"):
+        ai_opt = gr.Checkbox(label="Enable AI")
         pdf_opt = gr.Checkbox(label="Allow PDF Text Extraction")
+        q = gr.Textbox(lines=4)
+        a = gr.Textbox(lines=18)
+        gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, q], a)
+    with gr.Tab("Exports"):
+        gr.Markdown("### Signed / Clerk / PACER Outputs")
+        gr.File(label="Clerk Exhibit Packet (PDF)").upload(
+            lambda: generate_exhibit_packet(), outputs=None
+        )
+        gr.File(label="PACER-Ready Bundle (ZIP)").upload(
+            lambda: generate_pacer_bundle(), outputs=None
         )
+    with gr.Tab("FOIA Request"):
+        agency = gr.Textbox()
+        subject = gr.Textbox()
+        requester = gr.Textbox()
+        out = gr.Textbox(lines=14)
+        gr.Button("Generate").click(
+            lambda a,s,r: generate_foia_request_text(a,s,r),
+            [agency, subject, requester],
+            out
         )
 app.launch()