GodsDevProject commited on
Commit
6a02a5b
ยท
verified ยท
1 Parent(s): ef5f53e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +276 -60
app.py CHANGED
@@ -1,25 +1,44 @@
1
  import gradio as gr
 
2
  import hashlib
3
- import io
4
  import zipfile
 
 
5
  from datetime import datetime
6
  from urllib.parse import quote_plus, urlparse
7
- from collections import Counter
 
8
 
9
  import plotly.graph_objects as go
 
 
 
 
 
 
10
 
11
- from bluebook import bluebook_full
12
- from appendix import build_litigation_appendix
 
 
 
 
13
 
14
  # ======================================================
15
- # CONFIG
16
  # ======================================================
17
 
18
- ENABLE_AI = True
19
- ENABLE_PUBLIC_SHARE = True
 
 
 
 
 
 
20
 
21
  # ======================================================
22
- # ADAPTER
23
  # ======================================================
24
 
25
  class FOIAAdapter:
@@ -27,13 +46,22 @@ class FOIAAdapter:
27
  search_url = ""
28
 
29
  def search(self, query):
 
 
 
 
30
  return [{
31
  "agency": self.agency,
32
- "title": f"{self.agency} FOIA Search Results",
33
- "url": self.search_url.format(q=quote_plus(query)),
34
- "timestamp": datetime.utcnow().isoformat()
 
35
  }]
36
 
 
 
 
 
37
  class CIA(FOIAAdapter):
38
  agency = "CIA"
39
  search_url = "https://www.cia.gov/readingroom/search/site/{q}"
@@ -42,110 +70,298 @@ class FBI(FOIAAdapter):
42
  agency = "FBI"
43
  search_url = "https://vault.fbi.gov/search?SearchableText={q}"
44
 
45
- LIVE = [CIA(), FBI()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # ======================================================
48
- # STATE
49
  # ======================================================
50
 
51
- RESULTS = []
52
- SHARES = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # ======================================================
55
  # SEARCH
56
  # ======================================================
57
 
58
  def run_search(query):
59
- global RESULTS
60
- RESULTS = []
61
  rows = []
62
 
63
- for a in LIVE:
64
- for r in a.search(query):
65
- r["hash"] = hashlib.sha256(r["url"].encode()).hexdigest()[:16]
66
- RESULTS.append(r)
67
- rows.append([r["agency"], r["title"], r["url"], r["hash"]])
 
 
 
 
 
 
68
 
69
  return rows, render_cards()
70
 
71
  # ======================================================
72
- # CARDS
73
  # ======================================================
74
 
75
  def render_cards():
76
  cards = []
77
- for i, r in enumerate(RESULTS):
 
 
 
 
 
 
 
 
 
 
 
78
  cards.append(f"""
79
- <div style="border:1px solid #ddd;border-radius:12px;padding:14px;margin-bottom:16px">
80
- <b>{r['agency']}</b><br>
81
- {r['title']}<br><br>
82
- <a href="{r['url']}" target="_blank">View</a> |
83
- <a href="{r['url']}" download>Download</a> |
84
- <a href="#" onclick="share({i})">Share</a> |
85
- <i>Ask AI (opt-in)</i>
 
 
 
 
86
  </div>
87
  """)
88
- return "".join(cards)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # ======================================================
91
- # SHARE PAGE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  # ======================================================
93
 
94
  def create_share():
95
- sid = hashlib.sha256(str(RESULTS).encode()).hexdigest()[:12]
96
- SHARES[sid] = RESULTS.copy()
97
- return f"Share ID: `{sid}`"
98
 
99
- def load_share(sid):
100
- recs = SHARES.get(sid)
101
- if not recs:
102
- return "Invalid share ID"
103
- return "\n".join(bluebook_full(r) for r in recs)
104
 
105
  # ======================================================
106
- # EXPORTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # ======================================================
108
 
109
  def journalist_zip():
110
  buf = io.BytesIO()
111
  with zipfile.ZipFile(buf, "w") as z:
112
- z.writestr("citations.txt", "\n".join(bluebook_full(r) for r in RESULTS))
113
  z.writestr(
114
  "links.csv",
115
  "agency,title,url\n" +
116
- "\n".join(f"{r['agency']},{r['title']},{r['url']}" for r in RESULTS)
117
  )
118
  buf.seek(0)
119
  return buf
120
 
121
- def appendix_pdf():
122
- return build_litigation_appendix(RESULTS)
123
-
124
  # ======================================================
125
  # UI
126
  # ======================================================
127
 
128
- with gr.Blocks(title="Federal FOIA Intelligence Search") as app:
129
- gr.Markdown("""
130
- # ๐Ÿ›๏ธ Federal FOIA Intelligence Search
131
- **Public FOIA Electronic Reading Rooms**
132
- """)
 
 
 
 
 
133
 
134
  with gr.Tabs():
135
  with gr.Tab("๐Ÿ” Search"):
136
- q = gr.Textbox(label="Search FOIA Libraries")
137
- btn = gr.Button("Search")
138
- table = gr.Dataframe(headers=["Agency","Title","URL","Hash"])
139
  gallery = gr.HTML()
140
- btn.click(run_search, q, [table, gallery])
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  with gr.Tab("๐Ÿ“ค Share"):
143
  gr.Button("Create Share Page").click(create_share, outputs=gr.Textbox())
144
- sid = gr.Textbox(label="Load Share ID")
145
- gr.Button("Load").click(load_share, sid, gr.Textbox(lines=10))
146
 
147
- with gr.Tab("โš–๏ธ Litigation"):
148
- gr.Button("Generate Appendix PDF").click(appendix_pdf, outputs=gr.File())
 
 
149
 
150
  with gr.Tab("๐Ÿ—‚ Exports"):
151
  gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())
 
1
  import gradio as gr
2
+ import time
3
  import hashlib
 
4
  import zipfile
5
+ import io
6
+ import uuid
7
  from datetime import datetime
8
  from urllib.parse import quote_plus, urlparse
9
+ from collections import Counter, defaultdict
10
+ import requests
11
 
12
  import plotly.graph_objects as go
13
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
14
+ from reportlab.lib.styles import getSampleStyleSheet
15
+
16
# ======================================================
# OPTIONAL PDF TEXT EXTRACTION (SAFE / GUARDED)
# ======================================================

# pdfminer.six is optional: if it cannot be imported, extraction is
# silently disabled instead of crashing the app at startup.
PDF_TEXT_AVAILABLE = False
try:
    from pdfminer.high_level import extract_text
    PDF_TEXT_AVAILABLE = True
except Exception:
    PDF_TEXT_AVAILABLE = False

# ======================================================
# CONFIG / FEATURE GATES
# ======================================================

# NOTE(review): only ENABLE_PDF_EXTRACTION is consulted in the visible
# code; the other gates are declared but never checked — confirm wiring.
ENABLE_AI = True  # explicit opt-in required
ENABLE_PDF_EXTRACTION = True  # checkbox gated
ENABLE_ENTITY_GRAPHS = True
ENABLE_TIMELINES = True
ENABLE_JOURNALIST_ZIP = True
ENABLE_LITIGATION_PDF = True
ENABLE_COVERAGE_HEATMAP = True
ENABLE_LATENCY_BADGES = True
39
 
40
  # ======================================================
41
+ # BASE ADAPTER (LINK-OUT ONLY)
42
  # ======================================================
43
 
44
  class FOIAAdapter:
 
46
  search_url = ""
47
 
48
  def search(self, query):
49
+ start = time.time()
50
+ url = self.search_url.format(q=quote_plus(query))
51
+ latency = round((time.time() - start) * 1000, 1)
52
+
53
  return [{
54
  "agency": self.agency,
55
+ "title": f"{self.agency} FOIA Reading Room Result",
56
+ "url": url,
57
+ "timestamp": datetime.utcnow().isoformat(),
58
+ "latency_ms": latency
59
  }]
60
 
61
+ # ======================================================
62
+ # LIVE AGENCIES (SAFE)
63
+ # ======================================================
64
+
65
# Each subclass only supplies a display name and a public search-URL
# template; all behavior lives in FOIAAdapter.search.

class CIA(FOIAAdapter):
    agency = "CIA"
    search_url = "https://www.cia.gov/readingroom/search/site/{q}"

class FBI(FOIAAdapter):
    agency = "FBI"
    search_url = "https://vault.fbi.gov/search?SearchableText={q}"

class DOJ(FOIAAdapter):
    agency = "DOJ"
    search_url = "https://www.justice.gov/foia/library?search={q}"

class DHS(FOIAAdapter):
    agency = "DHS"
    search_url = "https://www.dhs.gov/foia-library/search?search={q}"

class STATE(FOIAAdapter):
    agency = "State Department"
    search_url = "https://foia.state.gov/Search/Search.aspx?q={q}"

class GSA(FOIAAdapter):
    agency = "GSA"
    search_url = "https://www.gsa.gov/reference/freedom-of-information-act-foia/foia-library?search={q}"

class NSA(FOIAAdapter):
    agency = "NSA"
    search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/?q={q}"

# Registry of adapters queried on every search.
LIVE_ADAPTERS = [CIA(), FBI(), DOJ(), DHS(), STATE(), GSA(), NSA()]
94
 
95
  # ======================================================
96
+ # GLOBAL STATE (IN-MEMORY ONLY)
97
  # ======================================================
98
 
99
# In-memory only: results of the most recent search.
LAST_RESULTS = []
# Index into LAST_RESULTS chosen for AI analysis.
# NOTE(review): nothing in the visible Python ever assigns this — the card
# button calls a client-side selectDoc() with no backend hook; confirm wiring.
SELECTED_INDEX = None
# Maps share token -> snapshot of LAST_RESULTS at share time.
SHARE_REGISTRY = {}
102
+
103
+ # ======================================================
104
+ # UTILITIES
105
+ # ======================================================
106
+
107
def citation_hash(r):
    """Return a 16-hex-char stable fingerprint of a result record.

    The fingerprint covers agency, URL and timestamp, joined with '|'.
    """
    fingerprint = "|".join((r["agency"], r["url"], r["timestamp"]))
    return hashlib.sha256(fingerprint.encode()).hexdigest()[:16]
110
+
111
def bluebook(r):
    """Render a result record as a Bluebook-style citation line."""
    retrieved = datetime.utcnow().strftime('%b %d, %Y')
    return (
        f"{r['agency']}, {r['title']}, FOIA Electronic Reading Room, "
        f"{r['url']} (retrieved {retrieved})."
    )
116
+
117
def ai_disclosure():
    """Return the court-ready disclosure footer appended to every AI answer."""
    lines = [
        "",
        "",
        "---",
        "AI DISCLOSURE (Court-Ready)",
        "โ€ข Analysis initiated only by user",
        "โ€ข PDF text extracted only with explicit opt-in",
        "โ€ข Public FOIA materials only",
        "โ€ข AI output is not evidence or legal advice",
        "โ€ข Verify against the original source",
        "",
    ]
    return "\n".join(lines)
127
+
128
def hash_ai_output(text):
    """SHA-256 hex digest of *text*; used as an integrity stamp on AI output."""
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()
130
 
131
  # ======================================================
132
  # SEARCH
133
  # ======================================================
134
 
135
def run_search(query):
    """Query every live adapter and return (table_rows, cards_html).

    Side effect: repopulates the module-level LAST_RESULTS cache, which the
    share / export / analysis features all read.
    """
    global LAST_RESULTS
    LAST_RESULTS = []
    rows = []

    for adapter in LIVE_ADAPTERS:
        for record in adapter.search(query):
            record["hash"] = citation_hash(record)
            LAST_RESULTS.append(record)
            rows.append([
                record["agency"],
                record["title"],
                record["url"],
                record["hash"],
                f"{record['latency_ms']} ms",
            ])

    return rows, render_cards()
153
 
154
  # ======================================================
155
+ # CARD GALLERY
156
  # ======================================================
157
 
158
def render_cards():
    """Render LAST_RESULTS as an HTML card gallery.

    PDF links get an inline <iframe> preview; everything else gets a plain
    link out to the agency page.
    NOTE(review): the Analyze button calls a client-side selectDoc() that is
    not defined in the visible Python/JS — confirm it is wired up elsewhere.
    """
    cards = []
    for idx, r in enumerate(LAST_RESULTS):
        url = r["url"]

        if url.lower().endswith(".pdf"):
            preview = f"<iframe src='{url}' height='220' width='100%'></iframe>"
        else:
            preview = f"<a href='{url}' target='_blank'>Open FOIA page</a>"

        latency = f"<span class='badge'>โฑ {r['latency_ms']} ms</span>"

        cards.append(f"""
    <div class="card">
      <div class="card-header">
        <b>{r['agency']}</b> {latency}
      </div>
      <div class="card-title">{r['title']}</div>
      {preview}
      <div class="actions">
        <a href="{url}" target="_blank">View</a>
        <a href="{url}" download>Download</a>
        <button onclick="selectDoc({idx})">Analyze / Ask AI</button>
      </div>
    </div>
    """)

    return "".join(cards) if cards else "<i>No results</i>"
188
+
189
+ # ======================================================
190
+ # PDF TEXT EXTRACTION (OPT-IN)
191
+ # ======================================================
192
+
193
def extract_pdf_text(url):
    """Download a PDF from *url* and return up to 6000 chars of extracted text.

    Returns "" when extraction is disabled, pdfminer is unavailable, or any
    network / HTTP / parse error occurs (best-effort by design).
    """
    if not (PDF_TEXT_AVAILABLE and ENABLE_PDF_EXTRACTION):
        return ""

    try:
        resp = requests.get(url, timeout=15)
        # Treat HTTP errors as "no text" instead of parsing an error page.
        resp.raise_for_status()
        # Parse in memory: the previous fixed "/tmp/tmp.pdf" path was a race
        # condition between concurrent users and not portable off Unix.
        return extract_text(io.BytesIO(resp.content))[:6000]
    except Exception:
        return ""
204
 
205
  # ======================================================
206
+ # AI ASK (STRICTLY OPT-IN)
207
+ # ======================================================
208
+
209
def ask_ai(opt_in, extract_opt_in, question):
    """Answer *question* about the currently selected document.

    Guard rails: refuses unless the AI checkbox is ticked, and only pulls PDF
    text when the extraction checkbox is also ticked. The answer carries a
    disclosure footer plus a SHA-256 integrity hash of its own text.

    NOTE(review): SELECTED_INDEX is never assigned by any visible Python
    handler (the cards' selectDoc() is client-side only), so this path
    currently always reports "Select a document first" — confirm the wiring.
    """
    if not opt_in:
        return "โš  AI disabled. Explicit opt-in required."

    if SELECTED_INDEX is None:
        return "โš  Select a document first."

    record = LAST_RESULTS[SELECTED_INDEX]

    context = ""
    if extract_opt_in and record["url"].lower().endswith(".pdf"):
        context = extract_pdf_text(record["url"])

    analysis = (
        f"AI ANALYSIS\n\n"
        f"Agency: {record['agency']}\n"
        f"Title: {record['title']}\n"
        f"URL: {record['url']}\n\n"
        f"User Question:\n{question}\n\n"
        f"Extracted Context (if any):\n{context[:1500]}\n\n"
        f"Summary:\nThis material is publicly available via FOIA."
    )

    final = analysis + ai_disclosure()
    return final + f"\n\nIntegrity Hash: {hash_ai_output(final)}"
234
+
235
+ # ======================================================
236
+ # PERSISTENT SHARE PAGES (LINK-ONLY)
237
  # ======================================================
238
 
239
def create_share():
    """Snapshot the current results under a short content-derived token."""
    token = hashlib.sha256(str(LAST_RESULTS).encode()).hexdigest()[:12]
    # .copy() is shallow: later mutation of the result dicts would leak
    # into the stored share.
    SHARE_REGISTRY[token] = LAST_RESULTS.copy()
    return f"Share ID: {token}"
243
 
244
def load_share(token):
    """Return Bluebook citations for a stored share, or an error string.

    An unknown token — or a share that was created while no results were
    loaded — both report as invalid (falsy check, preserved deliberately).
    """
    records = SHARE_REGISTRY.get(token)
    if not records:
        return "Invalid or expired share ID."
    return "\n".join(bluebook(r) for r in records)
249
 
250
  # ======================================================
251
+ # LITIGATION APPENDIX (PDF)
252
+ # ======================================================
253
+
254
def litigation_appendix():
    """Build a court-style appendix PDF of the current results.

    Returns the path of a temporary .pdf file. (Previously this returned an
    io.BytesIO, but gr.File output components expect a filepath, so the
    download button could not serve the buffer.)
    """
    import tempfile

    styles = getSampleStyleSheet()
    story = [
        Paragraph("<b>Litigation Appendix</b>", styles["Title"]),
        Spacer(1, 12),
        Paragraph(
            f"Generated {datetime.utcnow().strftime('%B %d, %Y UTC')}",
            styles["Normal"],
        ),
        Spacer(1, 12),
    ]

    # One exhibit per result, cited in Bluebook form.
    for i, r in enumerate(LAST_RESULTS, start=1):
        story.append(Paragraph(f"<b>Exhibit A-{i}</b>", styles["Heading2"]))
        story.append(Paragraph(bluebook(r), styles["Normal"]))
        story.append(Spacer(1, 8))

    out = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    out.close()
    SimpleDocTemplate(out.name).build(story)
    return out.name
276
+
277
+ # ======================================================
278
+ # COVERAGE HEATMAP
279
+ # ======================================================
280
+
281
def coverage_heatmap():
    """Single-column heatmap: result count per live agency."""
    per_agency = Counter(r["agency"] for r in LAST_RESULTS)
    agencies = [a.agency for a in LIVE_ADAPTERS]
    fig = go.Figure(
        data=go.Heatmap(
            z=[[per_agency.get(name, 0)] for name in agencies],
            x=["Results"],
            y=agencies,
            colorscale="Blues",
        ),
        layout=go.Layout(title="Agency Coverage Heatmap"),
    )
    return fig
292
+
293
+ # ======================================================
294
+ # ENTITY / TIMELINE
295
+ # ======================================================
296
+
297
def entity_graph():
    """Bar chart of result counts per source domain (netloc)."""
    by_domain = Counter(urlparse(r["url"]).netloc for r in LAST_RESULTS)
    bars = go.Bar(x=list(by_domain.keys()), y=list(by_domain.values()))
    return go.Figure([bars])
300
+
301
def timeline():
    """Bar chart of result counts per retrieval date (YYYY-MM-DD prefix)."""
    by_date = Counter(r["timestamp"][:10] for r in LAST_RESULTS)
    bars = go.Bar(x=list(by_date.keys()), y=list(by_date.values()))
    return go.Figure([bars])
304
+
305
+ # ======================================================
306
+ # JOURNALIST ZIP
307
  # ======================================================
308
 
309
def journalist_zip(results=None):
    """Bundle Bluebook citations and a CSV of links into a ZIP for journalists.

    results: list of result records; defaults to the module-level
        LAST_RESULTS (keeps the zero-arg Gradio binding working).
    Returns the path of a temporary .zip file. (Previously this returned an
    io.BytesIO, but gr.File output components expect a filepath, so the
    download button could not serve the buffer.)
    """
    import tempfile

    if results is None:
        results = LAST_RESULTS

    out = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
    with zipfile.ZipFile(out, "w") as z:
        z.writestr("citations.txt", "\n".join(bluebook(r) for r in results))
        z.writestr(
            "links.csv",
            "agency,title,url\n" +
            "\n".join(f"{r['agency']},{r['title']},{r['url']}" for r in results)
        )
    out.close()
    return out.name
320
 
 
 
 
321
  # ======================================================
322
  # UI
323
  # ======================================================
324
 
325
+ CSS = """
326
+ .card {border:1px solid #ddd;border-radius:12px;padding:14px;margin-bottom:18px}
327
+ .card-header {display:flex;justify-content:space-between}
328
+ .card-title {margin:6px 0 10px 0}
329
+ .actions a, .actions button {margin-right:10px}
330
+ .badge {background:#eef;padding:2px 6px;border-radius:6px;font-size:12px}
331
+ """
332
+
333
+ with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
334
+ gr.Markdown("# ๐Ÿ›๏ธ Federal FOIA Intelligence Search\nPublic FOIA Reading Rooms Only")
335
 
336
  with gr.Tabs():
337
  with gr.Tab("๐Ÿ” Search"):
338
+ query = gr.Textbox(label="Search FOIA Libraries")
339
+ search_btn = gr.Button("Search")
340
+ table = gr.Dataframe(headers=["Agency","Title","URL","Hash","Latency"])
341
  gallery = gr.HTML()
342
+ search_btn.click(run_search, query, [table, gallery])
343
+
344
+ with gr.Tab("๐Ÿง  Ask AI"):
345
+ ai_opt = gr.Checkbox(label="Enable AI (Explicit Opt-In)")
346
+ pdf_opt = gr.Checkbox(label="Allow PDF Text Extraction")
347
+ question = gr.Textbox(label="Ask about selected document", lines=4)
348
+ answer = gr.Textbox(lines=18)
349
+ gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, question], answer)
350
+
351
+ with gr.Tab("๐Ÿ“Š Analysis"):
352
+ gr.Button("Coverage Heatmap").click(coverage_heatmap, outputs=gr.Plot())
353
+ gr.Button("Entity Graph").click(entity_graph, outputs=gr.Plot())
354
+ gr.Button("Timeline").click(timeline, outputs=gr.Plot())
355
 
356
  with gr.Tab("๐Ÿ“ค Share"):
357
  gr.Button("Create Share Page").click(create_share, outputs=gr.Textbox())
358
+ share_id = gr.Textbox(label="Load Share ID")
359
+ gr.Button("Load").click(load_share, share_id, gr.Textbox(lines=10))
360
 
361
+ with gr.Tab("โš–๏ธ Court Tools"):
362
+ gr.Button("Generate Litigation Appendix PDF").click(
363
+ litigation_appendix, outputs=gr.File()
364
+ )
365
 
366
  with gr.Tab("๐Ÿ—‚ Exports"):
367
  gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())