Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Files Community

GodsDevProject committed on Jan 10

Commit

449afb8

verified ·

1 Parent(s): a063941

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -233

app.py CHANGED Viewed

@@ -1,42 +1,27 @@
 import gradio as gr
-import time
 import hashlib
 import zipfile
 import io
-import uuid
 from datetime import datetime
 from urllib.parse import quote_plus, urlparse
 from collections import Counter
-import requests
 import plotly.graph_objects as go
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
-# ======================================================
-# OPTIONAL PDF TEXT EXTRACTION (SAFE / GUARDED)
-# ======================================================
-PDF_TEXT_AVAILABLE = False
-try:
-    from pdfminer.high_level import extract_text
-    PDF_TEXT_AVAILABLE = True
-except Exception:
-    PDF_TEXT_AVAILABLE = False
 # ======================================================
-# CONFIG / FEATURE GATES
 # ======================================================
-ENABLE_AI = True                 # explicit opt-in required
-ENABLE_PDF_EXTRACTION = True     # user + checkbox gated
-ENABLE_ENTITY_GRAPHS = True
-ENABLE_TIMELINES = True
 ENABLE_JOURNALIST_ZIP = True
-ENABLE_LITIGATION_PDF = True
 # ======================================================
-# BASE ADAPTER (LINK-OUT ONLY)
 # ======================================================
 class FOIAAdapter:
@@ -44,18 +29,13 @@ class FOIAAdapter:
     search_url = ""
     def search(self, query):
-        url = self.search_url.format(q=quote_plus(query))
         return [{
             "agency": self.agency,
-            "title": f"{self.agency} FOIA Reading Room Result",
-            "url": url,
             "timestamp": datetime.utcnow().isoformat()
         }]
-# ======================================================
-# LIVE AGENCIES (SAFE)
-# ======================================================
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
@@ -64,212 +44,61 @@ class FBI(FOIAAdapter):
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
-class DOJ(FOIAAdapter):
-    agency = "DOJ"
-    search_url = "https://www.justice.gov/foia/library?search={q}"
-class DHS(FOIAAdapter):
-    agency = "DHS"
-    search_url = "https://www.dhs.gov/foia-library/search?search={q}"
-class STATE(FOIAAdapter):
-    agency = "State Department"
-    search_url = "https://foia.state.gov/Search/Search.aspx?q={q}"
-class GSA(FOIAAdapter):
-    agency = "GSA"
-    search_url = "https://www.gsa.gov/reference/freedom-of-information-act-foia/foia-library?search={q}"
-class NSA(FOIAAdapter):
-    agency = "NSA"
-    search_url = "https://www.nsa.gov/resources/everyone/foia/reading-room/?q={q}"
-LIVE_ADAPTERS = [CIA(), FBI(), DOJ(), DHS(), STATE(), GSA(), NSA()]
-# ======================================================
-# GLOBAL STATE (IN-MEMORY ONLY)
 # ======================================================
-LAST_RESULTS = []
-SELECTED_INDEX = None
-SHARE_REGISTRY = {}
 # ======================================================
-# UTILITIES
-# ======================================================
-def citation_hash(r):
-    raw = f"{r['agency']}|{r['url']}|{r['timestamp']}"
-    return hashlib.sha256(raw.encode()).hexdigest()[:16]
-def bluebook(r):
-    return (
-        f"{r['agency']}, {r['title']}, FOIA Electronic Reading Room, "
-        f"{r['url']} (retrieved {datetime.utcnow().strftime('%b %d, %Y')})."
-    )
-def ai_disclosure():
-    return (
-        "\n\n---\n"
-        "AI DISCLOSURE\n"
-        "• User-initiated analysis only\n"
-        "• PDF text extracted only with explicit opt-in\n"
-        "• Public FOIA documents only\n"
-        "• AI output is not evidence or legal advice\n"
-        "• Verify against the original source\n"
-    )
-def hash_ai_output(text):
-    return hashlib.sha256(text.encode()).hexdigest()
 # ======================================================
 # SEARCH
 # ======================================================
 def run_search(query):
-    global LAST_RESULTS
-    LAST_RESULTS = []
     rows = []
-    for adapter in LIVE_ADAPTERS:
-        for r in adapter.search(query):
-            r["hash"] = citation_hash(r)
-            LAST_RESULTS.append(r)
             rows.append([r["agency"], r["title"], r["url"], r["hash"]])
-    return rows, render_cards()
-# ======================================================
-# CARD GALLERY
-# ======================================================
-def render_cards():
-    cards = []
-    for idx, r in enumerate(LAST_RESULTS):
-        url = r["url"]
-        is_pdf = url.lower().endswith(".pdf")
-        preview = (
-            f"<iframe src='{url}' height='220' width='100%'></iframe>"
-            if is_pdf else
-            f"<a href='{url}' target='_blank'>Open link</a>"
-        )
-        cards.append(f"""
-        <div class="card">
-          <b>{r['agency']}</b><br>
-          {r['title']}<br><br>
-          {preview}
-          <div class="actions">
-            <a href="{url}" target="_blank">View</a>
-            <a href="{url}" download>Download</a>
-            <a href="/share/{idx}" target="_blank">Share</a>
-            <button onclick="selectDoc({idx})">Ask AI</button>
-          </div>
-        </div>
-        """)
-    return "".join(cards) if cards else "<i>No results</i>"
 # ======================================================
-# PDF TEXT EXTRACTION (OPT-IN)
 # ======================================================
-def extract_pdf_text(url):
-    if not (PDF_TEXT_AVAILABLE and ENABLE_PDF_EXTRACTION):
-        return ""
-    try:
-        r = requests.get(url, timeout=15)
-        with open("/tmp/tmp.pdf", "wb") as f:
-            f.write(r.content)
-        return extract_text("/tmp/tmp.pdf")[:6000]
-    except Exception:
-        return ""
-# ======================================================
-# AI ASK (STRICTLY OPT-IN)
-# ======================================================
-def ask_ai(opt_in, extract_opt_in, question):
-    if not opt_in:
-        return "⚠ AI disabled. Explicit opt-in required."
-    if SELECTED_INDEX is None:
-        return "⚠ Select a document first."
-    r = LAST_RESULTS[SELECTED_INDEX]
-    context = ""
-    if extract_opt_in and r["url"].lower().endswith(".pdf"):
-        context = extract_pdf_text(r["url"])
-    analysis = (
         f"AI ANALYSIS\n\n"
-        f"Agency: {r['agency']}\n"
-        f"Title: {r['title']}\n"
-        f"URL: {r['url']}\n\n"
         f"Question:\n{question}\n\n"
-        f"Context Extracted:\n{context[:1500]}\n\n"
-        f"Analysis:\nThis document is publicly available via FOIA."
     )
-    final = analysis + ai_disclosure()
-    return final + f"\n\nIntegrity Hash: {hash_ai_output(final)}"
-# ======================================================
-# SHARE PAGES (LINK-ONLY)
-# ======================================================
-def create_share(idx):
-    token = str(uuid.uuid4())[:8]
-    SHARE_REGISTRY[token] = LAST_RESULTS[idx]
-    return f"Public Share Token: {token}"
 # ======================================================
-# LITIGATION APPENDIX (PDF)
-# ======================================================
-def litigation_appendix():
-    buf = io.BytesIO()
-    doc = SimpleDocTemplate(buf)
-    styles = getSampleStyleSheet()
-    story = [Paragraph("<b>Litigation Appendix</b>", styles["Title"]), Spacer(1, 12)]
-    for i, r in enumerate(LAST_RESULTS, start=1):
-        story.append(Paragraph(f"<b>Exhibit A-{i}</b>", styles["Heading2"]))
-        story.append(Paragraph(bluebook(r), styles["Normal"]))
-        story.append(Spacer(1, 8))
-    doc.build(story)
-    buf.seek(0)
-    return buf
-# ======================================================
-# ENTITY / TIMELINE
-# ======================================================
-def entity_graph():
-    domains = Counter(urlparse(r["url"]).netloc for r in LAST_RESULTS)
-    return go.Figure([go.Bar(x=list(domains.keys()), y=list(domains.values()))])
-def timeline():
-    dates = Counter(r["timestamp"][:10] for r in LAST_RESULTS)
-    return go.Figure([go.Bar(x=list(dates.keys()), y=list(dates.values()))])
-# ======================================================
-# JOURNALIST ZIP
 # ======================================================
 def journalist_zip():
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as z:
-        z.writestr("citations.txt", "\n".join(bluebook(r) for r in LAST_RESULTS))
-        z.writestr(
-            "links.csv",
-            "agency,title,url\n" +
-            "\n".join(f"{r['agency']},{r['title']},{r['url']}" for r in LAST_RESULTS)
-        )
     buf.seek(0)
     return buf
@@ -277,39 +106,21 @@ def journalist_zip():
 # UI
 # ======================================================
-CSS = """
-.card {border:1px solid #ddd;border-radius:10px;padding:12px;margin-bottom:16px}
-.actions a, .actions button {margin-right:8px}
-"""
-with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
-    gr.Markdown("# 🏛️ Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
     with gr.Tabs():
-        with gr.Tab("🔍 Search"):
-            query = gr.Textbox(label="Search FOIA Libraries")
-            search_btn = gr.Button("Search")
             table = gr.Dataframe(headers=["Agency","Title","URL","Hash"])
-            gallery = gr.HTML()
-            search_btn.click(run_search, query, [table, gallery])
-        with gr.Tab("🧠 Ask AI"):
-            ai_opt = gr.Checkbox(label="Enable AI (Explicit Opt-In)")
-            pdf_opt = gr.Checkbox(label="Allow PDF text extraction (Explicit Opt-In)")
-            question = gr.Textbox(label="Ask about selected document", lines=4)
-            answer = gr.Textbox(lines=16)
-            gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, question], answer)
-        with gr.Tab("📊 Analysis"):
-            gr.Button("Entity Graph").click(entity_graph, outputs=gr.Plot())
-            gr.Button("Timeline").click(timeline, outputs=gr.Plot())
-        with gr.Tab("⚖️ Court Tools"):
-            gr.Button("Generate Litigation Appendix PDF").click(
-                litigation_appendix, outputs=gr.File()
-            )
-        with gr.Tab("🗂 Exports"):
             gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())
 app.launch()

 import gradio as gr
 import hashlib
 import zipfile
 import io
 from datetime import datetime
 from urllib.parse import quote_plus, urlparse
 from collections import Counter
 import plotly.graph_objects as go
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
+from bluebook import bluebook_full
+from semantic import SemanticIndex, FAISS_AVAILABLE
 # ======================================================
+# CONFIG
 # ======================================================
+ENABLE_AI = True
 ENABLE_JOURNALIST_ZIP = True
 # ======================================================
+# FOIA ADAPTER
 # ======================================================
 class FOIAAdapter:
     search_url = ""
     def search(self, query):
         return [{
             "agency": self.agency,
+            "title": f"{self.agency} FOIA Search Results",
+            "url": self.search_url.format(q=quote_plus(query)),
             "timestamp": datetime.utcnow().isoformat()
         }]
 class CIA(FOIAAdapter):
     agency = "CIA"
     search_url = "https://www.cia.gov/readingroom/search/site/{q}"
     agency = "FBI"
     search_url = "https://vault.fbi.gov/search?SearchableText={q}"
+LIVE_ADAPTERS = [CIA(), FBI()]
 # ======================================================
+# STATE
 # ======================================================
+RESULTS = []
+SEMANTIC = None
+SELECTED = None
 # ======================================================
 # SEARCH
 # ======================================================
 def run_search(query):
+    global RESULTS
+    RESULTS = []
     rows = []
+    for a in LIVE_ADAPTERS:
+        for r in a.search(query):
+            r["hash"] = hashlib.sha256(r["url"].encode()).hexdigest()[:16]
+            RESULTS.append(r)
             rows.append([r["agency"], r["title"], r["url"], r["hash"]])
+    return rows
 # ======================================================
+# AI ASK
 # ======================================================
+def ask_ai(opt_in, question):
+    if not opt_in or not SELECTED:
+        return "AI disabled or no document selected."
+    r = SELECTED
+    return (
         f"AI ANALYSIS\n\n"
+        f"{r['title']}\n{r['url']}\n\n"
         f"Question:\n{question}\n\n"
+        f"Analysis is informational only.\n\n"
+        "AI DISCLOSURE:\n"
+        "• User-initiated\n"
+        "• Public FOIA documents only\n"
+        "• Verify against original source\n"
     )
 # ======================================================
+# EXPORT
 # ======================================================
 def journalist_zip():
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as z:
+        z.writestr("citations.txt", "\n".join(bluebook_full(r) for r in RESULTS))
     buf.seek(0)
     return buf
 # UI
 # ======================================================
+with gr.Blocks(title="Federal FOIA Intelligence Search") as app:
     with gr.Tabs():
+        with gr.Tab("Search"):
+            q = gr.Textbox()
+            btn = gr.Button("Search")
             table = gr.Dataframe(headers=["Agency","Title","URL","Hash"])
+            btn.click(run_search, q, table)
+        with gr.Tab("AI Ask"):
+            opt = gr.Checkbox(label="Enable AI (Opt-In)")
+            question = gr.Textbox(lines=4)
+            answer = gr.Textbox(lines=12)
+            gr.Button("Ask AI").click(ask_ai, [opt, question], answer)
+        with gr.Tab("Exports"):
             gr.Button("Journalist ZIP").click(journalist_zip, outputs=gr.File())
 app.launch()