Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Files Community

GodsDevProject committed on Jan 10

Commit

4394fda

verified ·

1 Parent(s): 2bf60a1

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -64

app.py CHANGED Viewed

@@ -3,46 +3,57 @@ import time
 import hashlib
 import zipfile
 import io
 from datetime import datetime
 from urllib.parse import quote_plus, urlparse
-from collections import defaultdict, Counter
 import requests
 import plotly.graph_objects as go
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
 # ======================================================
 # CONFIG / FEATURE GATES
 # ======================================================
-ENABLE_AI = True                 # explicit user opt-in required
-ENABLE_PDF_THUMBNAILS = True
 ENABLE_ENTITY_GRAPHS = True
 ENABLE_TIMELINES = True
 ENABLE_JOURNALIST_ZIP = True
 # ======================================================
-# BASE ADAPTER
 # ======================================================
 class FOIAAdapter:
     agency = "UNKNOWN"
     search_url = ""
-    is_live = True
     def search(self, query):
         url = self.search_url.format(q=quote_plus(query))
         return [{
             "agency": self.agency,
-            "title": f"{self.agency} FOIA Search Results",
             "url": url,
-            "is_live": self.is_live,
             "timestamp": datetime.utcnow().isoformat()
         }]
 # ======================================================
-# LIVE AGENCIES
 # ======================================================
 class CIA(FOIAAdapter):
@@ -75,12 +86,20 @@ class NSA(FOIAAdapter):
 LIVE_ADAPTERS = [CIA(), FBI(), DOJ(), DHS(), STATE(), GSA(), NSA()]
 # ======================================================
 # UTILITIES
 # ======================================================
 def citation_hash(r):
-    raw = f"{r['agency']}{r['url']}{r['timestamp']}"
     return hashlib.sha256(raw.encode()).hexdigest()[:16]
 def bluebook(r):
@@ -94,18 +113,14 @@ def ai_disclosure():
         "\n\n---\n"
         "AI DISCLOSURE\n"
         "• User-initiated analysis only\n"
-        "• PDF processed only when explicitly requested\n"
         "• Public FOIA documents only\n"
-        "• Not legal advice or a primary source\n"
-        "• Verify against the original record\n"
     )
-# ======================================================
-# GLOBAL STATE
-# ======================================================
-LAST_RESULTS = []
-SELECTED_DOC = None
 # ======================================================
 # SEARCH
@@ -120,42 +135,36 @@ def run_search(query):
         for r in adapter.search(query):
             r["hash"] = citation_hash(r)
             LAST_RESULTS.append(r)
-            rows.append([
-                r["agency"],
-                r["title"],
-                r["url"],
-                r["hash"]
-            ])
     return rows, render_cards()
 # ======================================================
-# CARD / THUMBNAIL GALLERY
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
         url = r["url"]
         is_pdf = url.lower().endswith(".pdf")
         preview = (
-            f"<iframe src='{url}' width='100%' height='200'></iframe>"
             if is_pdf else
             f"<a href='{url}' target='_blank'>Open link</a>"
         )
         cards.append(f"""
-        <div style="border:1px solid #ccc;border-radius:10px;padding:12px;margin-bottom:16px">
           <b>{r['agency']}</b><br>
           {r['title']}<br><br>
           {preview}
-          <div style="margin-top:8px">
-            <a href="{url}" target="_blank">View</a> |
-            <a href="{url}" download>Download</a> |
-            <a href="{url}" target="_blank">Share</a> |
-            <a href="#" onclick="selectDoc({idx})">Ask AI</a>
           </div>
         </div>
         """)
@@ -163,32 +172,81 @@ def render_cards():
     return "".join(cards) if cards else "<i>No results</i>"
 # ======================================================
-# AI ASK (PDF ONLY WHEN CLICKED)
 # ======================================================
-def ask_ai(opt_in, question):
     if not opt_in:
         return "⚠ AI disabled. Explicit opt-in required."
-    if SELECTED_DOC is None:
         return "⚠ Select a document first."
-    r = SELECTED_DOC
-    summary = (
         f"AI ANALYSIS\n\n"
         f"Agency: {r['agency']}\n"
         f"Title: {r['title']}\n"
         f"URL: {r['url']}\n\n"
         f"Question:\n{question}\n\n"
-        f"Analysis:\n"
-        f"This document is publicly available via FOIA. "
-        f"Key themes, entities, and relevance should be reviewed directly in the source."
     )
-    return summary + ai_disclosure()
 # ======================================================
-# ENTITY + TIMELINE
 # ======================================================
 def entity_graph():
@@ -206,7 +264,6 @@ def timeline():
 def journalist_zip():
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as z:
-        z.writestr("README.txt", "Public FOIA links only.\nNo documents included.")
         z.writestr("citations.txt", "\n".join(bluebook(r) for r in LAST_RESULTS))
         z.writestr(
             "links.csv",
@@ -217,24 +274,16 @@ def journalist_zip():
     return buf
 # ======================================================
-# JS HELPERS
 # ======================================================
-JS = """
-<script>
-function selectDoc(idx){
-  fetch(`/select/${idx}`);
-  alert("Document selected for AI analysis");
-}
-</script>
 """
-# ======================================================
-# UI
-# ======================================================
-with gr.Blocks(title="Federal FOIA Intelligence Search") as app:
-    gr.HTML(JS)
     with gr.Tabs():
         with gr.Tab("🔍 Search"):
@@ -244,19 +293,22 @@ with gr.Blocks(title="Federal FOIA Intelligence Search") as app:
             gallery = gr.HTML()
             search_btn.click(run_search, query, [table, gallery])
-        with gr.Tab("📄 Documents"):
-            gallery.render()
-        with gr.Tab("🧠 AI Ask"):
             ai_opt = gr.Checkbox(label="Enable AI (Explicit Opt-In)")
             question = gr.Textbox(label="Ask about selected document", lines=4)
-            answer = gr.Textbox(lines=14)
-            gr.Button("Ask AI").click(ask_ai, [ai_opt, question], answer)
         with gr.Tab("📊 Analysis"):
             gr.Button("Entity Graph").click(entity_graph, outputs=gr.Plot())
             gr.Button("Timeline").click(timeline, outputs=gr.Plot())
         with gr.Tab("🗂 Exports"):
             gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())

 import hashlib
 import zipfile
 import io
+import uuid
 from datetime import datetime
 from urllib.parse import quote_plus, urlparse
+from collections import Counter
 import requests
 import plotly.graph_objects as go
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
+# ======================================================
+# OPTIONAL PDF TEXT EXTRACTION (SAFE / GUARDED)
+# ======================================================
+PDF_TEXT_AVAILABLE = False
+try:
+    from pdfminer.high_level import extract_text
+    PDF_TEXT_AVAILABLE = True
+except Exception:
+    PDF_TEXT_AVAILABLE = False
 # ======================================================
 # CONFIG / FEATURE GATES
 # ======================================================
+ENABLE_AI = True                 # explicit opt-in required
+ENABLE_PDF_EXTRACTION = True     # user + checkbox gated
 ENABLE_ENTITY_GRAPHS = True
 ENABLE_TIMELINES = True
 ENABLE_JOURNALIST_ZIP = True
+ENABLE_LITIGATION_PDF = True
 # ======================================================
+# BASE ADAPTER (LINK-OUT ONLY)
 # ======================================================
 class FOIAAdapter:
     agency = "UNKNOWN"
     search_url = ""
     def search(self, query):
         url = self.search_url.format(q=quote_plus(query))
         return [{
             "agency": self.agency,
+            "title": f"{self.agency} FOIA Reading Room Result",
             "url": url,
             "timestamp": datetime.utcnow().isoformat()
         }]
 # ======================================================
+# LIVE AGENCIES (SAFE)
 # ======================================================
 class CIA(FOIAAdapter):
 LIVE_ADAPTERS = [CIA(), FBI(), DOJ(), DHS(), STATE(), GSA(), NSA()]
+# ======================================================
+# GLOBAL STATE (IN-MEMORY ONLY)
+# ======================================================
+LAST_RESULTS = []
+SELECTED_INDEX = None
+SHARE_REGISTRY = {}
 # ======================================================
 # UTILITIES
 # ======================================================
 def citation_hash(r):
+    raw = f"{r['agency']}|{r['url']}|{r['timestamp']}"
     return hashlib.sha256(raw.encode()).hexdigest()[:16]
 def bluebook(r):
         "\n\n---\n"
         "AI DISCLOSURE\n"
         "• User-initiated analysis only\n"
+        "• PDF text extracted only with explicit opt-in\n"
         "• Public FOIA documents only\n"
+        "• AI output is not evidence or legal advice\n"
+        "• Verify against the original source\n"
     )
def hash_ai_output(text):
    """Return the full SHA-256 hex digest of *text* (encoded as UTF-8)."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
 # ======================================================
 # SEARCH
         for r in adapter.search(query):
             r["hash"] = citation_hash(r)
             LAST_RESULTS.append(r)
+            rows.append([r["agency"], r["title"], r["url"], r["hash"]])
     return rows, render_cards()
 # ======================================================
+# CARD GALLERY
 # ======================================================
 def render_cards():
     cards = []
     for idx, r in enumerate(LAST_RESULTS):
         url = r["url"]
         is_pdf = url.lower().endswith(".pdf")
         preview = (
+            f"<iframe src='{url}' height='220' width='100%'></iframe>"
             if is_pdf else
             f"<a href='{url}' target='_blank'>Open link</a>"
         )
         cards.append(f"""
+        <div class="card">
           <b>{r['agency']}</b><br>
           {r['title']}<br><br>
           {preview}
+          <div class="actions">
+            <a href="{url}" target="_blank">View</a>
+            <a href="{url}" download>Download</a>
+            <a href="/share/{idx}" target="_blank">Share</a>
+            <button onclick="selectDoc({idx})">Ask AI</button>
           </div>
         </div>
         """)
     return "".join(cards) if cards else "<i>No results</i>"
 # ======================================================
+# PDF TEXT EXTRACTION (OPT-IN)
+# ======================================================
def extract_pdf_text(url):
    """Download the PDF at *url* and return up to 6000 characters of its text.

    Returns an empty string when extraction is unavailable or disabled
    (``PDF_TEXT_AVAILABLE`` / ``ENABLE_PDF_EXTRACTION``), or when the
    download or the parse fails for any reason.
    """
    if not (PDF_TEXT_AVAILABLE and ENABLE_PDF_EXTRACTION):
        return ""
    try:
        response = requests.get(url, timeout=15)
        # Do not hand an HTTP error page to the PDF parser.
        response.raise_for_status()
        # Parse from memory instead of a fixed /tmp/tmp.pdf path: a shared
        # filename lets concurrent requests overwrite each other's download
        # and leaves the file on disk afterwards.
        return extract_text(io.BytesIO(response.content))[:6000]
    except Exception:
        # Deliberate best-effort: any network or parser failure means
        # "no context available", never a crash in the caller.
        return ""
+# ======================================================
+# AI ASK (STRICTLY OPT-IN)
 # ======================================================
def ask_ai(opt_in, extract_opt_in, question):
    """Build the analysis text for the currently selected document.

    Args:
        opt_in: Truthy when the user has ticked the AI opt-in box.
        extract_opt_in: Truthy when the user also allows PDF text
            extraction; only honoured for URLs ending in ``.pdf``.
        question: The user's question, echoed into the output.

    Returns:
        A warning string when AI is not enabled or no valid document is
        selected; otherwise the analysis text followed by the AI disclosure
        and a SHA-256 integrity hash of that text.
    """
    # Honour the module-level feature gate as well as the per-request
    # checkbox, the same way extract_pdf_text honours its own gate.
    if not (ENABLE_AI and opt_in):
        return "⚠ AI disabled. Explicit opt-in required."
    idx = SELECTED_INDEX
    # Reject a missing, negative or out-of-range selection instead of
    # raising IndexError or silently wrapping around to another record.
    if idx is None or not 0 <= idx < len(LAST_RESULTS):
        return "⚠ Select a document first."
    r = LAST_RESULTS[idx]
    context = ""
    if extract_opt_in and r["url"].lower().endswith(".pdf"):
        context = extract_pdf_text(r["url"])
    analysis = (
        f"AI ANALYSIS\n\n"
        f"Agency: {r['agency']}\n"
        f"Title: {r['title']}\n"
        f"URL: {r['url']}\n\n"
        f"Question:\n{question}\n\n"
        f"Context Extracted:\n{context[:1500]}\n\n"
        f"Analysis:\nThis document is publicly available via FOIA."
    )
    final = analysis + ai_disclosure()
    # The hash covers the analysis plus disclosure, not the hash line itself.
    return final + f"\n\nIntegrity Hash: {hash_ai_output(final)}"
 # ======================================================
+# SHARE PAGES (LINK-ONLY)
+# ======================================================
def create_share(idx):
    """Register ``LAST_RESULTS[idx]`` under a fresh token and report it.

    Args:
        idx: Zero-based position of the record in ``LAST_RESULTS``.

    Returns:
        The string ``"Public Share Token: <token>"`` where the token is
        8 hexadecimal characters.

    Raises:
        IndexError: If *idx* is negative or past the end of the results.
    """
    # A negative index would otherwise wrap around and share the wrong record.
    if not 0 <= idx < len(LAST_RESULTS):
        raise IndexError("share index out of range")
    token = uuid.uuid4().hex[:8]
    # Eight hex characters is a small space; never overwrite a live share.
    while token in SHARE_REGISTRY:
        token = uuid.uuid4().hex[:8]
    SHARE_REGISTRY[token] = LAST_RESULTS[idx]
    return f"Public Share Token: {token}"
+# ======================================================
+# LITIGATION APPENDIX (PDF)
+# ======================================================
def litigation_appendix():
    """Build a PDF appendix listing every record in ``LAST_RESULTS``.

    The document opens with a "Litigation Appendix" title; each record then
    gets an "Exhibit A-<n>" heading (numbered from 1) followed by its
    ``bluebook`` citation.

    Returns:
        An ``io.BytesIO`` holding the rendered PDF, rewound to offset 0.
    """
    # NOTE(review): the UI wires this return value to gr.File(); confirm
    # that component accepts an in-memory buffer rather than a file path.
    output = io.BytesIO()
    pdf = SimpleDocTemplate(output)
    sheet = getSampleStyleSheet()
    flowables = [Paragraph("<b>Litigation Appendix</b>", sheet["Title"]), Spacer(1, 12)]
    for number, record in enumerate(LAST_RESULTS, start=1):
        flowables.extend((
            Paragraph(f"<b>Exhibit A-{number}</b>", sheet["Heading2"]),
            Paragraph(bluebook(record), sheet["Normal"]),
            Spacer(1, 8),
        ))
    pdf.build(flowables)
    output.seek(0)
    return output
+# ======================================================
+# ENTITY / TIMELINE
 # ======================================================
 def entity_graph():
 def journalist_zip():
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as z:
         z.writestr("citations.txt", "\n".join(bluebook(r) for r in LAST_RESULTS))
         z.writestr(
             "links.csv",
     return buf
 # ======================================================
+# UI
 # ======================================================
+CSS = """
+.card {border:1px solid #ddd;border-radius:10px;padding:12px;margin-bottom:16px}
+.actions a, .actions button {margin-right:8px}
 """
+with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
+    gr.Markdown("# 🏛️ Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
     with gr.Tabs():
         with gr.Tab("🔍 Search"):
             gallery = gr.HTML()
             search_btn.click(run_search, query, [table, gallery])
+        with gr.Tab("🧠 Ask AI"):
             ai_opt = gr.Checkbox(label="Enable AI (Explicit Opt-In)")
+            pdf_opt = gr.Checkbox(label="Allow PDF text extraction (Explicit Opt-In)")
             question = gr.Textbox(label="Ask about selected document", lines=4)
+            answer = gr.Textbox(lines=16)
+            gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, question], answer)
         with gr.Tab("📊 Analysis"):
             gr.Button("Entity Graph").click(entity_graph, outputs=gr.Plot())
             gr.Button("Timeline").click(timeline, outputs=gr.Plot())
+        with gr.Tab("⚖️ Court Tools"):
+            gr.Button("Generate Litigation Appendix PDF").click(
+                litigation_appendix, outputs=gr.File()
+            )
         with gr.Tab("🗂 Exports"):
             gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())