Spaces:

GodsDevProject
/

FOIA_Doc_Search

Sleeping

App Files Files Community

GodsDevProject committed on Jan 11

Commit

5bf571b

verified ·

1 Parent(s): f35a4a0

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -148

app.py CHANGED Viewed

@@ -8,9 +8,10 @@ import base64
 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
-import os
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet
 from citations import bluebook_exhibit, table_of_authorities
@@ -20,7 +21,7 @@ from foia_requests import generate_foia_request_text
 # HARD FEATURE FLAGS (GOVERNANCE ENFORCED)
 # ======================================================
-ENABLE_FAISS_PHASE_4 = False   # MUST remain False unless formal approval
 ENABLE_AI = True
 ENABLE_PDF_EXTRACTION = True
@@ -33,6 +34,7 @@ PDF_THUMBNAIL_AVAILABLE = False
 try:
     from pdfminer.high_level import extract_text
     PDF_TEXT_AVAILABLE = True
 except Exception:
     pass
@@ -51,59 +53,17 @@ LAST_RESULTS = []
 SELECTED_INDEX = None
 # ======================================================
-# HELPERS
 # ======================================================
 def citation_hash(r):
-    """
-    Return a 16-hex-character citation id for result ``r``.
-
-    The id is the truncated SHA-256 of "agency|url|timestamp", so ``r``
-    must contain all three keys (KeyError otherwise).
-    """
     return hashlib.sha256(
-        f"{r['agency']}|{r['url']}|{r['timestamp']}".encode()
     ).hexdigest()[:16]
-def signed_permalink_manifest(results):
-    """
-    Build a hash-anchored manifest of ``results`` for citation or audit.
-
-    Every result becomes one numbered exhibit entry (agency, title,
-    resolved URL, hash). ``manifest_hash`` is the SHA-256 of the
-    key-sorted JSON of the manifest as it stands before that key is added.
-    The manifest embeds the current UTC time, so the hash differs between
-    calls even for identical results.
-    """
-    generated = datetime.utcnow().isoformat()
-    exhibits = []
-    for number, doc in enumerate(results, start=1):
-        exhibits.append({
-            "exhibit": number,
-            "agency": doc["agency"],
-            "title": doc["title"],
-            "resolved_url": doc["resolved_url"],
-            "hash": doc["hash"],
-        })
-    manifest = {
-        "generated_utc": generated,
-        "tool": "Federal FOIA Intelligence Search",
-        "documents": exhibits,
-    }
-    canonical = json.dumps(manifest, sort_keys=True)
-    manifest["manifest_hash"] = hashlib.sha256(canonical.encode()).hexdigest()
-    return manifest
-def fre_callout():
-    """Return the fixed, educational FRE reference block (no trailing newline)."""
-    segments = (
-        "FRE Reference (Educational):\n",
-        "• Rule 901 – Authentication\n",
-        "• Rule 803(8) – Public Records Exception\n",
-        "• Rule 1005 – Copies of Public Records\n",
-        "Not legal advice.",
-    )
-    return "".join(segments)
-def ai_disclosure():
-    """Return the fixed AI disclosure footer, including its leading rule line."""
-    segments = (
-        "\n\n---\n",
-        "AI DISCLOSURE\n",
-        "• User-initiated only\n",
-        "• Public FOIA documents only\n",
-        "• No legal advice\n",
-        "• Verify against cited exhibit\n",
-    )
-    return "".join(segments)
-def hash_ai_output(text):
-    """Return the hex SHA-256 digest of ``text`` after ``str.encode()``."""
-    encoded = text.encode()
-    hasher = hashlib.sha256()
-    hasher.update(encoded)
-    return hasher.hexdigest()
 def resolve_pdf_url(url):
     try:
         r = requests.get(
@@ -114,16 +74,40 @@ def resolve_pdf_url(url):
         )
         ct = r.headers.get("content-type", "").lower()
         is_pdf = r.url.lower().endswith(".pdf") or "application/pdf" in ct
-        return is_pdf, r.url
     except Exception:
-        return False, url
-def generate_pdf_thumbnails(url, max_pages=3):
     if not PDF_THUMBNAIL_AVAILABLE:
         return []
     try:
-        r = requests.get(url, timeout=15)
-        images = convert_from_bytes(r.content, first_page=1, last_page=max_pages)
         thumbs = []
         for img in images:
             buf = io.BytesIO()
@@ -204,12 +188,17 @@ def run_search(query, agencies):
     for name in agencies:
         adapter = ALL_ADAPTERS[name]
         for r in adapter.search(query):
-            r["hash"] = citation_hash(r)
-            r["resolved_pdf"], r["resolved_url"] = resolve_pdf_url(r["url"])
-            r["thumbnails"] = (
-                generate_pdf_thumbnails(r["resolved_url"])
-                if r["resolved_pdf"] else []
-            )
             LAST_RESULTS.append(r)
             rows.append([
                 r["agency"],
@@ -259,87 +248,93 @@ def select_doc(idx):
     return f"Selected document #{idx + 1}"
 # ======================================================
-# AI ASK
 # ======================================================
-def ask_ai(opt_in, pdf_opt_in, question):
-    """
-    Assemble the citation-anchored response text for the selected document.
-
-    Returns a short refusal message when AI was not opted into, when no
-    document is selected, or when the selected result is not a resolved
-    PDF. Otherwise returns the exhibit citation, the FRE callout, the
-    question, an optional text excerpt (first 4000 characters, only when
-    ``pdf_opt_in`` is set and PDF text extraction is available), the AI
-    disclosure, and a SHA-256 integrity hash of everything before it.
-    """
-    if not opt_in:
-        return "Explicit AI opt-in required."
-    if SELECTED_INDEX is None:
-        return "Select a document first."
-
-    selected = LAST_RESULTS[SELECTED_INDEX]
-    if not selected["resolved_pdf"]:
-        return "AI available only for public PDFs."
-
-    excerpt, pin_cite = "", "n.p."
-    if pdf_opt_in and PDF_TEXT_AVAILABLE:
-        try:
-            response = requests.get(selected["resolved_url"], timeout=15)
-            extracted = extract_text(io.BytesIO(response.content))
-            excerpt = extracted[:4000]
-            pin_cite = "p. 1"
-        except Exception:
-            # Best-effort: on any failure the excerpt stays empty.
-            pass
-
-    citation = bluebook_exhibit(selected, SELECTED_INDEX + 1, pin=pin_cite)
-    callout = fre_callout()
-    analysis = (
-        f"{citation}\n\n"
-        f"{callout}\n\n"
-        f"Question:\n{question}\n\n"
-        f"Context:\n{excerpt}"
-    )
-    final = analysis + ai_disclosure()
-    integrity = hash_ai_output(final)
-    return final + f"\n\nIntegrity Hash: {integrity}"
 # ======================================================
-# CLERK EXHIBIT PACKET (PDF)
 # ======================================================
-def generate_exhibit_packet():
-    """
-    Build the clerk-format exhibit packet PDF from LAST_RESULTS.
-
-    Each result becomes a numbered exhibit showing agency, title, resolved
-    URL and hash. Returns the PDF as a BytesIO rewound to position 0.
-    """
     buf = io.BytesIO()
     styles = getSampleStyleSheet()
     doc = SimpleDocTemplate(buf)
     story = []
-    story.append(Paragraph("Exhibit Packet (Clerk Format)", styles["Title"]))
     story.append(Spacer(1, 12))
     for i, r in enumerate(LAST_RESULTS, start=1):
-        story.append(Paragraph(
-            f"Exhibit {i}: {r['agency']} — {r['title']}", styles["Heading2"]
-        ))
         story.append(Paragraph(r["resolved_url"], styles["Normal"]))
-        story.append(Paragraph(f"Hash: {r['hash']}", styles["Code"]))
-        story.append(Spacer(1, 12))
     doc.build(story)
     buf.seek(0)
     return buf
 # ======================================================
-# PACER-READY BUNDLE (ZIP)
 # ======================================================
-def generate_pacer_bundle():
-    """
-    Build an in-memory ZIP holding manifest.json and README.txt.
-
-    manifest.json is the signed_permalink_manifest of LAST_RESULTS,
-    serialized with two-space indentation. Returns the archive as a
-    BytesIO rewound to position 0.
-    """
-    buf = io.BytesIO()
-    z = zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED)
-    manifest = signed_permalink_manifest(LAST_RESULTS)
-    z.writestr("manifest.json", json.dumps(manifest, indent=2))
-    z.writestr("README.txt",
-        "PACER-Ready Educational Bundle\n"
-        "No filing performed. User responsible for review.\n"
     )
-    # Closing the archive before the buffer is rewound and returned.
-    z.close()
-    buf.seek(0)
-    return buf
 # ======================================================
 # UI
@@ -354,49 +349,33 @@ CSS = """
-# Gradio UI: Search, Ask AI, Exports and FOIA Request tabs, then launch.
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
     gr.Markdown("## Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
-    gr.HTML("""
-    <button onclick="window.open('/governance-site/index.html','_blank')">
-      Governance & Trust Documentation
-    </button>
-    """)
     with gr.Tab("Search"):
         agencies = gr.CheckboxGroup(
             choices=list(ALL_ADAPTERS.keys()),
-            value=list(ALL_ADAPTERS.keys()),
-            label="Agencies"
         )
-        query = gr.Textbox(placeholder="e.g. AATIP, surveillance")
-        table = gr.Dataframe(headers=["Agency","Title","Resolved URL","Hash","Latency"])
         gallery = gr.HTML()
-        status = gr.Textbox(label="Selection Status")
         gr.Button("Search").click(run_search, [query, agencies], [table, gallery, status])
-    with gr.Tab("Ask AI"):
-        ai_opt = gr.Checkbox(label="Enable AI")
-        pdf_opt = gr.Checkbox(label="Allow PDF Text Extraction")
-        q = gr.Textbox(lines=4)
-        a = gr.Textbox(lines=18)
-        gr.Button("Ask AI").click(ask_ai, [ai_opt, pdf_opt, q], a)
-    with gr.Tab("Exports"):
-        gr.Markdown("### Signed / Clerk / PACER Outputs")
-        # NOTE(review): both File components react to an upload event and
-        # pass outputs=None, so the buffer each lambda returns is not
-        # routed to any component — confirm this is the intended wiring.
-        gr.File(label="Clerk Exhibit Packet (PDF)").upload(
-            lambda: generate_exhibit_packet(), outputs=None
-        )
-        gr.File(label="PACER-Ready Bundle (ZIP)").upload(
-            lambda: generate_pacer_bundle(), outputs=None
-        )
-    with gr.Tab("FOIA Request"):
         agency = gr.Textbox()
         subject = gr.Textbox()
-        requester = gr.Textbox()
         out = gr.Textbox(lines=14)
-        gr.Button("Generate").click(
-            lambda a,s,r: generate_foia_request_text(a,s,r),
-            [agency, subject, requester],
-            out
         )
 app.launch()

 from datetime import datetime
 from urllib.parse import quote_plus
 import requests
+from reportlab.platypus import (
+    SimpleDocTemplate, Paragraph, Spacer, PageBreak
+)
 from reportlab.lib.styles import getSampleStyleSheet
 from citations import bluebook_exhibit, table_of_authorities
 # HARD FEATURE FLAGS (GOVERNANCE ENFORCED)
 # ======================================================
+ENABLE_FAISS_PHASE_4 = False   # MUST remain False
 ENABLE_AI = True
 ENABLE_PDF_EXTRACTION = True
 try:
     from pdfminer.high_level import extract_text
+    from pdfminer.pdfpage import PDFPage
     PDF_TEXT_AVAILABLE = True
 except Exception:
     pass
 SELECTED_INDEX = None
 # ======================================================
+# HELPERS — HASHING & CITATION
 # ======================================================
+def sha256_bytes(b: bytes) -> str:
+    """Return the hexadecimal SHA-256 digest of ``b``."""
+    hasher = hashlib.sha256()
+    hasher.update(b)
+    return hasher.hexdigest()
 def citation_hash(r):
+    """
+    Return a 16-hex-character citation id for result ``r``.
+
+    The id is the truncated SHA-256 of "agency|resolved_url|timestamp",
+    so ``r`` must already contain all three keys (KeyError otherwise).
+    """
+    # NOTE(review): the visible run_search fragment evaluates this call
+    # inside the same dict literal that first stores "resolved_url" —
+    # confirm the key is present on ``r`` before the hash is computed.
     return hashlib.sha256(
+        f"{r['agency']}|{r['resolved_url']}|{r['timestamp']}".encode()
     ).hexdigest()[:16]
 def resolve_pdf_url(url):
     try:
         r = requests.get(
         )
         ct = r.headers.get("content-type", "").lower()
         is_pdf = r.url.lower().endswith(".pdf") or "application/pdf" in ct
+        return is_pdf, r.url, r.content
     except Exception:
+        return False, url, b""
+def compute_page_hashes(pdf_bytes):
+    """
+    Page-level SHA-256 hashes for pin cites.
+
+    Each page is hashed over its decoded content stream(s), so the digest
+    changes when the page's own drawing instructions change. Resources a
+    page merely references (fonts, images) are not part of the digest.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document.
+
+    Returns:
+        Dict mapping 1-based page number to a hex SHA-256 digest. Empty
+        when PDF support is unavailable or ``pdf_bytes`` is empty; partial
+        when parsing fails part-way through the document.
+    """
+    if not PDF_TEXT_AVAILABLE or not pdf_bytes:
+        return {}
+    page_hashes = {}
+    try:
+        # Imported lazily: pdfminer is optional, and PDF_TEXT_AVAILABLE is
+        # only set once the pdfminer imports have succeeded.
+        from pdfminer.pdftypes import stream_value
+
+        pages = PDFPage.get_pages(io.BytesIO(pdf_bytes))
+        for page_number, page in enumerate(pages, start=1):
+            digest = hashlib.sha256()
+            # Entries may be indirect references; stream_value resolves
+            # each one to its stream before the data is decoded.
+            for content in page.contents:
+                digest.update(stream_value(content).get_data())
+            page_hashes[page_number] = digest.hexdigest()
+    except Exception:
+        # Best-effort, as before: a failure keeps whatever pages were
+        # hashed so far instead of raising into the caller.
+        pass
+    return page_hashes
+# ======================================================
+# OPTIONAL VISUALS
+# ======================================================
+def generate_pdf_thumbnails(pdf_bytes, max_pages=3):
     if not PDF_THUMBNAIL_AVAILABLE:
         return []
     try:
+        images = convert_from_bytes(
+            pdf_bytes, first_page=1, last_page=max_pages
+        )
         thumbs = []
         for img in images:
             buf = io.BytesIO()
     for name in agencies:
         adapter = ALL_ADAPTERS[name]
         for r in adapter.search(query):
+            is_pdf, resolved_url, pdf_bytes = resolve_pdf_url(r["url"])
+            page_hashes = compute_page_hashes(pdf_bytes) if is_pdf else {}
+            r.update({
+                "resolved_pdf": is_pdf,
+                "resolved_url": resolved_url,
+                "hash": citation_hash(r),
+                "page_hashes": page_hashes,
+                "thumbnails": generate_pdf_thumbnails(pdf_bytes)
+            })
             LAST_RESULTS.append(r)
             rows.append([
                 r["agency"],
     return f"Selected document #{idx + 1}"
 # ======================================================
+# JUDGE-SPECIFIC EXHIBIT FORMATTERS
 # ======================================================
+def judge_caption(jurisdiction):
+    """
+    Return the caption heading for a court code.
+
+    "SDNY", "DDC" and "NDCA" map to their two-line district court
+    captions; any other value gets the generic fallback caption.
+    """
+    known_courts = (
+        ("SDNY", "UNITED STATES DISTRICT COURT\nSOUTHERN DISTRICT OF NEW YORK"),
+        ("DDC", "UNITED STATES DISTRICT COURT\nDISTRICT OF COLUMBIA"),
+        ("NDCA", "UNITED STATES DISTRICT COURT\nNORTHERN DISTRICT OF CALIFORNIA"),
+    )
+    for code, caption in known_courts:
+        if jurisdiction == code:
+            return caption
+    return "COURT OF COMPETENT JURISDICTION"
 # ======================================================
+# EXHIBIT PACKET (PACER / STATE VARIANT)
 # ======================================================
+def generate_exhibit_packet(jurisdiction, state_variant=False):
+    """
+    Build the exhibit packet PDF for every entry in LAST_RESULTS.
+
+    The first page carries the court caption for ``jurisdiction`` and the
+    appendix notice; each result then gets its own page with its exhibit
+    number, resolved URL, document hash and any per-page hashes.
+
+    Args:
+        jurisdiction: Court code passed to judge_caption.
+        state_variant: Accepted for interface compatibility; not read by
+            this function.
+
+    Returns:
+        The PDF as a BytesIO rewound to position 0.
+    """
+    # Paragraph parses its text as XML-like markup, so text that comes
+    # from a result (the URL) must be escaped before it is rendered.
+    from xml.sax.saxutils import escape
+
     buf = io.BytesIO()
     styles = getSampleStyleSheet()
     doc = SimpleDocTemplate(buf)
     story = []
+    # Paragraph collapses "\n"; <br/> keeps the caption on two lines.
+    caption = judge_caption(jurisdiction).replace("\n", "<br/>")
+    story.append(Paragraph(caption, styles["Title"]))
     story.append(Spacer(1, 12))
+    story.append(Paragraph(
+        "PACER Appendix – Educational / Clerk Format Only",
+        styles["Italic"]
+    ))
+    story.append(PageBreak())
     for i, r in enumerate(LAST_RESULTS, start=1):
+        story.append(Paragraph(f"EXHIBIT {i}", styles["Heading1"]))
+        story.append(Paragraph(escape(r["resolved_url"]), styles["Normal"]))
+        story.append(Paragraph(f"Document Hash: {r['hash']}", styles["Code"]))
+        # Results without page hashes simply list none.
+        for p, h in (r.get("page_hashes") or {}).items():
+            story.append(Paragraph(
+                f"Page {p} SHA-256: {h}", styles["Code"]
+            ))
+        story.append(PageBreak())
     doc.build(story)
     buf.seek(0)
     return buf
 # ======================================================
+# FOIA LITIGATION-HOLD PACKET
 # ======================================================
+def generate_lit_hold_packet(agency, subject):
+    """
+    Render the plain-text FOIA litigation-hold notice template.
+
+    Args:
+        agency: Agency name inserted after "Agency:".
+        subject: Subject matter inserted after "Subject Matter:".
+
+    Returns:
+        The notice text, stamped with the current UTC time.
+    """
+    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
+    # time and drop tzinfo so the ISO string keeps its offset-free form;
+    # the template itself appends " UTC".
+    from datetime import timezone
+    issued = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+    return (
+        "FOIA LITIGATION HOLD NOTICE\n\n"
+        f"Agency: {agency}\n"
+        f"Subject Matter: {subject}\n\n"
+        "Preserve all records, emails, drafts, metadata,\n"
+        "and electronic communications reasonably related.\n\n"
+        f"Issued: {issued} UTC\n"
+        "Educational template only."
+    )
+# ======================================================
+# FEE WAIVER JUSTIFICATION
+# ======================================================
+def generate_fee_waiver(agency, public_interest):
+    """
+    Render the plain-text FOIA fee-waiver request template.
+
+    Args:
+        agency: Agency name inserted after "Agency:".
+        public_interest: Free text placed under the "Purpose:" heading.
+
+    Returns:
+        The request text, dated with the current UTC time.
+    """
+    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
+    # time and drop tzinfo so the ISO string keeps its offset-free form;
+    # the template itself appends " UTC".
+    from datetime import timezone
+    dated = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+    return (
+        "FOIA FEE WAIVER REQUEST\n\n"
+        f"Agency: {agency}\n\n"
+        "This request is made in the public interest and\n"
+        "is likely to contribute significantly to public\n"
+        "understanding of government operations.\n\n"
+        f"Purpose:\n{public_interest}\n\n"
+        f"Date: {dated} UTC"
     )
+# ======================================================
+# NEUTRAL CITATION EXPORT
+# ======================================================
+def export_neutral_citations():
+    """
+    Return one pipe-delimited citation line per entry in LAST_RESULTS.
+
+    Lines are numbered from 1 in result order and joined with newlines;
+    an empty result list yields an empty string.
+    """
+    return "\n".join(
+        f"Ex. {number} | {doc['agency']} | {doc['resolved_url']} | {doc['hash']}"
+        for number, doc in enumerate(LAST_RESULTS, start=1)
+    )
 # ======================================================
 # UI
 with gr.Blocks(css=CSS, title="Federal FOIA Intelligence Search") as app:
     gr.Markdown("## Federal FOIA Intelligence Search\nPublic Reading Rooms Only")
     with gr.Tab("Search"):
         agencies = gr.CheckboxGroup(
             choices=list(ALL_ADAPTERS.keys()),
+            value=list(ALL_ADAPTERS.keys())
         )
+        query = gr.Textbox()
+        table = gr.Dataframe(headers=["Agency","Title","URL","Hash","Latency"])
         gallery = gr.HTML()
+        status = gr.Textbox()
         gr.Button("Search").click(run_search, [query, agencies], [table, gallery, status])
+    with gr.Tab("Court Exports"):
+        court = gr.Radio(["SDNY","DDC","NDCA"], value="SDNY")
+        gr.File().upload(lambda c=court: generate_exhibit_packet(c))
+    with gr.Tab("FOIA Tools"):
         agency = gr.Textbox()
         subject = gr.Textbox()
         out = gr.Textbox(lines=14)
+        gr.Button("Litigation Hold").click(
+            generate_lit_hold_packet, [agency, subject], out
+        )
+    with gr.Tab("Citations"):
+        out = gr.Textbox(lines=16)
+        gr.Button("Export Neutral Citations").click(
+            export_neutral_citations, None, out
         )
 app.launch()