GodsDevProject committed on
Commit
cc720ae
·
verified ·
1 Parent(s): dafc3d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -143
app.py CHANGED
@@ -3,21 +3,19 @@
3
  # HF Reviewer–Safe / Court-Safe Reference Implementation
4
  # ======================================================
5
 
6
- import os, io, zipfile, tempfile, hashlib, base64
7
  from datetime import datetime
8
  from urllib.parse import quote_plus
9
  import requests
10
 
11
  import gradio as gr
12
  from fastapi import FastAPI
 
13
  from fastapi.responses import JSONResponse
14
 
15
- # ======================================================
16
- # OPTIONAL PDF SUPPORT
17
- # ======================================================
18
-
19
  PDF_THUMBNAILS_AVAILABLE = False
20
- PDF_TEXT_EXTRACTION_AVAILABLE = False
21
 
22
  try:
23
  from pdf2image import convert_from_bytes
@@ -26,8 +24,8 @@ except Exception:
26
  pass
27
 
28
  try:
29
- from PyPDF2 import PdfReader
30
- PDF_TEXT_EXTRACTION_AVAILABLE = True
31
  except Exception:
32
  pass
33
 
@@ -39,9 +37,9 @@ from reportlab.lib.pagesizes import LETTER
39
  # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
40
  # ======================================================
41
 
42
- ENABLE_AI = True
43
- ENABLE_FAISS_PHASE_4 = False # REQUIRES FORMAL APPROVAL
44
- ENABLE_DOC_LEVEL_APIS = False
45
 
46
  # ======================================================
47
  # SESSION STATE (EPHEMERAL)
@@ -59,7 +57,7 @@ def sha256_text(t: str) -> str:
59
 
60
  def provenance_block(payload: str, ai=False) -> str:
61
  return "\n".join([
62
- "Tool-Version: 1.9.0",
63
  f"Generated-UTC: {datetime.utcnow().isoformat()}",
64
  f"Content-SHA256: {sha256_text(payload)}",
65
  "Public-Source-Only: true",
@@ -68,31 +66,7 @@ def provenance_block(payload: str, ai=False) -> str:
68
  ])
69
 
70
  # ======================================================
71
- # FAISS PHASE-4 FORMAL APPROVAL WORKFLOW
72
- # ======================================================
73
-
74
- FAISS_APPROVAL_MEMO = """
75
- Phase-4 FAISS Approval Workflow
76
-
77
- 1. Written authorization from data-owning agency
78
- 2. Judicial approval (if court-adjacent use)
79
- 3. Privacy Impact Assessment (PIA)
80
- 4. Security review (no embeddings of restricted data)
81
- 5. ENABLE_FAISS_PHASE_4 flag set to True
82
- 6. Signed change record archived
83
-
84
- Status: NOT APPROVED
85
- """
86
-
87
- class Phase4FAISS:
88
- def __init__(self):
89
- if not ENABLE_FAISS_PHASE_4:
90
- raise RuntimeError(
91
- "Phase-4 FAISS indexing is disabled pending formal approval."
92
- )
93
-
94
- # ======================================================
95
- # FOIA ADAPTERS (LINK-OUT ONLY)
96
  # ======================================================
97
 
98
  class FOIAAdapter:
@@ -106,6 +80,8 @@ class FOIAAdapter:
106
  "title": f"{self.agency} FOIA Reading Room",
107
  "resolved_url": url,
108
  "timestamp": datetime.utcnow().isoformat(),
 
 
109
  }]
110
 
111
  class CIA(FOIAAdapter):
@@ -116,21 +92,40 @@ class FBI(FOIAAdapter):
116
  agency = "FBI"
117
  search_url = "https://vault.fbi.gov/search?SearchableText={q}"
118
 
 
 
 
 
 
 
 
 
119
  ALL_ADAPTERS = {
120
  "CIA": CIA(),
121
  "FBI": FBI(),
 
 
122
  }
123
 
124
  # ======================================================
125
- # TRUE PDF THUMBNAILS
126
  # ======================================================
127
 
128
- def generate_pdf_thumbnails(url, max_pages=2):
 
 
 
 
 
 
 
 
 
129
  if not PDF_THUMBNAILS_AVAILABLE:
130
  return []
131
  try:
132
  r = requests.get(url, timeout=10)
133
- images = convert_from_bytes(r.content, first_page=1, last_page=max_pages)
134
  thumbs = []
135
  for img in images:
136
  buf = io.BytesIO()
@@ -140,23 +135,6 @@ def generate_pdf_thumbnails(url, max_pages=2):
140
  except Exception:
141
  return []
142
 
143
- # ======================================================
144
- # REAL PDF TEXT EXTRACTION (PUBLIC DOCS ONLY)
145
- # ======================================================
146
-
147
- def extract_pdf_text(url, limit=1500):
148
- if not PDF_TEXT_EXTRACTION_AVAILABLE:
149
- return "PDF text extraction not available in this environment."
150
- try:
151
- r = requests.get(url, timeout=10)
152
- reader = PdfReader(io.BytesIO(r.content))
153
- text = ""
154
- for page in reader.pages[:5]:
155
- text += page.extract_text() or ""
156
- return text[:limit]
157
- except Exception:
158
- return "Unable to extract text from PDF."
159
-
160
  # ======================================================
161
  # SEARCH
162
  # ======================================================
@@ -168,138 +146,139 @@ def run_search(query, agencies):
168
  rows = []
169
  for name in agencies:
170
  for r in ALL_ADAPTERS[name].search(query):
171
- r["hash"] = sha256_text(r["resolved_url"])[:16]
172
- r["thumbnails"] = (
173
- generate_pdf_thumbnails(r["resolved_url"])
174
- if r["resolved_url"].lower().endswith(".pdf")
175
- else []
176
- )
177
- r["extracted_text"] = (
178
- extract_pdf_text(r["resolved_url"])
179
- if r["resolved_url"].lower().endswith(".pdf")
180
- else ""
181
- )
182
  LAST_RESULTS.append(r)
183
- rows.append([r["agency"], r["title"], r["resolved_url"], r["hash"]])
184
 
185
  return rows, render_cards(), "Search complete."
186
 
187
  # ======================================================
188
- # ASK AI (ASSISTIVE ONLY)
189
  # ======================================================
190
 
191
  def ask_ai(index: int):
192
  global AI_APPENDIX
193
  r = LAST_RESULTS[index]
194
 
195
- text = (
 
 
 
196
  "AI Assistive Summary (Non-Authoritative)\n\n"
197
  f"Agency: {r['agency']}\n"
198
- f"Source URL: {r['resolved_url']}\n\n"
199
- f"Extracted Text Preview:\n{r.get('extracted_text','')[:800]}"
200
  )
201
 
202
  AI_APPENDIX = {
203
- "text": text,
204
- "hash": sha256_text(text),
205
- "prov": provenance_block(text, ai=True)
206
  }
207
 
208
- return text + "\n\n" + AI_APPENDIX["prov"]
209
 
210
  # ======================================================
211
- # CLERK TRAINING – ACTUAL PDF SLIDES
212
  # ======================================================
213
 
214
- def generate_clerk_training_pdf():
215
- path = os.path.join(tempfile.gettempdir(), "Judicial_Clerk_Training.pdf")
216
- styles = getSampleStyleSheet()
217
- doc = SimpleDocTemplate(path, pagesize=LETTER)
218
- story = []
 
 
 
 
 
 
219
 
220
- slides = [
221
- "FOIA Intelligence Tool – Clerk Training",
222
- "What This Tool Is\n\n• Public FOIA link-out search\n• No scraping\n• No sealed data",
223
- "What This Tool Is NOT\n\n• Not evidence\n• Not legal advice\n• Not authentication",
224
- "AI Usage\n\n• User-initiated\n• Assistive only\n• Cryptographically hashed",
225
- "CM/ECF Compatibility\n\n• Informational exhibits\n• Hash-verifiable\n• No metadata mutation",
226
- ]
227
 
228
- for s in slides:
229
- story.append(Paragraph(s.replace("\n", "<br/>"), styles["Title"]))
230
- story.append(PageBreak())
231
 
232
- doc.build(story)
233
- return path
234
 
235
  # ======================================================
236
- # AO / CM-ECF COMPATIBILITY MEMO
237
  # ======================================================
238
 
239
- AO_CMECF_MEMO = """
240
- Administrative Office / CM-ECF Compatibility Memo
 
 
 
241
 
242
- This system produces informational exhibits only
243
- No filing automation or docket access
244
- • No PACER integration
245
- • Hashes provided for verification
246
- • AI output segregated and labeled
247
 
248
- Compatible with AO guidance on non-filing research tools.
 
 
 
 
249
  """
250
 
251
  # ======================================================
252
- # COURT BUNDLE
253
  # ======================================================
254
 
255
- def generate_court_bundle():
256
- with tempfile.TemporaryDirectory() as td:
257
- path = os.path.join(td, "court_bundle.zip")
258
- with zipfile.ZipFile(path, "w") as z:
259
- for i, r in enumerate(LAST_RESULTS, 1):
260
- z.writestr(
261
- f"Exhibit_{i:03d}.txt",
262
- f"{r['resolved_url']}\n\n{provenance_block(r['resolved_url'])}"
263
- )
264
-
265
- if AI_APPENDIX:
266
- z.writestr("AI_Appendix.txt", AI_APPENDIX["text"])
267
- z.writestr("AI_Appendix.provenance.txt", AI_APPENDIX["prov"])
268
 
269
- z.writestr("AO_CMECF_Memo.txt", AO_CMECF_MEMO)
270
- z.writestr("FAISS_Phase4_Workflow.txt", FAISS_APPROVAL_MEMO)
 
271
 
272
- return path
 
273
 
274
  # ======================================================
275
  # UI
276
  # ======================================================
277
 
 
 
 
 
 
278
  def render_cards():
279
  cards = []
280
  for i, r in enumerate(LAST_RESULTS):
281
  thumbs = "".join(
282
- f'<img src="data:image/png;base64,{t}" style="width:120px;margin-right:8px;border-radius:6px;" />'
283
  for t in r["thumbnails"]
284
  )
 
285
  cards.append(f"""
286
- <div style="border:1px solid #444;border-radius:14px;padding:14px;margin-bottom:14px;">
287
  <b>{r['agency']}</b><br/>
288
  {r['title']}<br/>
289
  {thumbs}
290
- <div style="margin-top:8px;">
291
- <a href="{r['resolved_url']}" target="_blank">View</a>
292
- &nbsp;|&nbsp;
293
- <button onclick="fetch('/ask_ai?index={i}')"
294
- style="background:#1e88e5;color:white;border:none;border-radius:999px;padding:4px 12px;">
 
295
  Ask AI
296
  </button>
297
  </div>
 
 
 
298
  </div>
299
  """)
300
  return "".join(cards)
301
 
302
- with gr.Blocks() as demo:
303
  gr.Markdown("## Federal FOIA Intelligence Search")
304
 
305
  with gr.Tab("Search"):
@@ -308,23 +287,15 @@ with gr.Blocks() as demo:
308
  table = gr.Dataframe(headers=["Agency", "Title", "URL", "Hash"])
309
  cards = gr.HTML()
310
  status = gr.Textbox()
311
- gr.Button("Search").click(run_search, [query, agencies], [table, cards, status])
 
 
312
 
313
  with gr.Tab("Court / Clerk"):
314
- gr.Button("Download Clerk Training PDF").click(
315
- lambda: generate_clerk_training_pdf(),
316
- None,
317
- gr.File()
318
- )
319
- gr.Button("Generate Court Bundle").click(
320
- lambda: generate_court_bundle(),
321
- None,
322
- gr.File()
323
- )
324
 
325
- with gr.Tab("Trust & Governance"):
326
- gr.Markdown(AO_CMECF_MEMO)
327
- gr.Markdown(FAISS_APPROVAL_MEMO)
328
 
329
  demo.queue()
330
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  # HF Reviewer–Safe / Court-Safe Reference Implementation
4
  # ======================================================
5
 
6
+ import os, io, zipfile, tempfile, hashlib, base64, time
7
  from datetime import datetime
8
  from urllib.parse import quote_plus
9
  import requests
10
 
11
  import gradio as gr
12
  from fastapi import FastAPI
13
+ from fastapi.staticfiles import StaticFiles
14
  from fastapi.responses import JSONResponse
15
 
16
+ # Optional PDF support
 
 
 
17
  PDF_THUMBNAILS_AVAILABLE = False
18
+ PDF_TEXT_AVAILABLE = False
19
 
20
  try:
21
  from pdf2image import convert_from_bytes
 
24
  pass
25
 
26
  try:
27
+ from pdfminer.high_level import extract_text
28
+ PDF_TEXT_AVAILABLE = True
29
  except Exception:
30
  pass
31
 
 
37
  # HARD GOVERNANCE FLAGS (NON-NEGOTIABLE)
38
  # ======================================================
39
 
40
+ ENABLE_AI = True # USER-INITIATED ONLY
41
+ ENABLE_FAISS_PHASE_4 = False # FORMAL APPROVAL REQUIRED
42
+ ENABLE_DOC_LEVEL_APIS = False # CIA/FBI DO NOT CURRENTLY PROVIDE
43
 
44
  # ======================================================
45
  # SESSION STATE (EPHEMERAL)
 
57
 
58
  def provenance_block(payload: str, ai=False) -> str:
59
  return "\n".join([
60
+ "Tool-Version: 2.0.0",
61
  f"Generated-UTC: {datetime.utcnow().isoformat()}",
62
  f"Content-SHA256: {sha256_text(payload)}",
63
  "Public-Source-Only: true",
 
66
  ])
67
 
68
  # ======================================================
69
+ # FOIA ADAPTERS (LINK-OUT ONLY ACCURATE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # ======================================================
71
 
72
  class FOIAAdapter:
 
80
  "title": f"{self.agency} FOIA Reading Room",
81
  "resolved_url": url,
82
  "timestamp": datetime.utcnow().isoformat(),
83
+ "is_pdf": False,
84
+ "thumbnails": []
85
  }]
86
 
87
  class CIA(FOIAAdapter):
 
92
  agency = "FBI"
93
  search_url = "https://vault.fbi.gov/search?SearchableText={q}"
94
 
95
+ class DOJ(FOIAAdapter):
96
+ agency = "DOJ"
97
+ search_url = "https://www.justice.gov/foia/library?search={q}"
98
+
99
+ class DHS(FOIAAdapter):
100
+ agency = "DHS"
101
+ search_url = "https://www.dhs.gov/foia-library/search?search={q}"
102
+
103
  ALL_ADAPTERS = {
104
  "CIA": CIA(),
105
  "FBI": FBI(),
106
+ "DOJ": DOJ(),
107
+ "DHS": DHS(),
108
  }
109
 
110
  # ======================================================
111
+ # PDF DETECTION (SAFE — NO SCRAPING)
112
  # ======================================================
113
 
114
def resolve_pdf(url):
    """Follow redirects for *url* and report whether the target is a PDF.

    Only the response headers and the final URL are inspected, so the
    request is streamed and closed without downloading the body.

    Args:
        url: The URL to probe.

    Returns:
        A ``(is_pdf, final_url)`` tuple. ``is_pdf`` is True when the path of
        the final URL ends in ``.pdf`` or the Content-Type header mentions
        ``application/pdf``. On any failure the result is ``(False, url)``
        with the original, unresolved URL.
    """
    from urllib.parse import urlsplit

    try:
        # stream=True defers the body download; the context manager releases
        # the connection once the headers have been read.
        with requests.get(
            url, timeout=10, allow_redirects=True, stream=True
        ) as r:
            final_url = r.url
            ct = r.headers.get("content-type", "").lower()
        # Test the path component only, so "report.pdf?download=1" or
        # "report.pdf#page=2" is still recognised as a PDF link.
        path = urlsplit(final_url).path.lower()
        is_pdf = path.endswith(".pdf") or "application/pdf" in ct
        return is_pdf, final_url
    except Exception:
        # Deliberate best-effort: a failed probe must not break the search.
        return False, url
122
+
123
+ def generate_thumbnails(url, pages=2):
124
  if not PDF_THUMBNAILS_AVAILABLE:
125
  return []
126
  try:
127
  r = requests.get(url, timeout=10)
128
+ images = convert_from_bytes(r.content, first_page=1, last_page=pages)
129
  thumbs = []
130
  for img in images:
131
  buf = io.BytesIO()
 
135
  except Exception:
136
  return []
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # ======================================================
139
  # SEARCH
140
  # ======================================================
 
146
  rows = []
147
  for name in agencies:
148
  for r in ALL_ADAPTERS[name].search(query):
149
+ is_pdf, resolved = resolve_pdf(r["resolved_url"])
150
+ r["resolved_url"] = resolved
151
+ r["is_pdf"] = is_pdf
152
+ r["thumbnails"] = generate_thumbnails(resolved) if is_pdf else []
153
+ r["hash"] = sha256_text(resolved)[:16]
 
 
 
 
 
 
154
  LAST_RESULTS.append(r)
155
+ rows.append([r["agency"], r["title"], resolved, r["hash"]])
156
 
157
  return rows, render_cards(), "Search complete."
158
 
159
  # ======================================================
160
+ # ASK-AI (STRICTLY GATED)
161
  # ======================================================
162
 
163
def ask_ai(index: int):
    """Build the non-authoritative assistive summary for one search result.

    Args:
        index: Zero-based position of the result in ``LAST_RESULTS``.

    Returns:
        The summary text followed by its provenance block, or a short
        explanatory message when the index does not refer to a stored
        result or AI is not permitted for that result.

    Side effects:
        On success, replaces the module-level ``AI_APPENDIX`` with the new
        summary, its SHA-256 hash, and its provenance block.
    """
    global AI_APPENDIX

    # Reject out-of-range indices instead of raising IndexError, and reject
    # negative ones so they cannot silently select from the end of the list.
    if not 0 <= index < len(LAST_RESULTS):
        return "No search result exists at that index."

    r = LAST_RESULTS[index]

    # AI is gated on the global flag and on the result being a PDF; a record
    # without an "is_pdf" key is treated as not being a PDF.
    if not (ENABLE_AI and r.get("is_pdf")):
        return "AI is disabled for this result."

    summary = (
        "AI Assistive Summary (Non-Authoritative)\n\n"
        f"Agency: {r['agency']}\n"
        f"Source: {r['resolved_url']}\n\n"
        "This summary assists review of a public FOIA document only."
    )

    AI_APPENDIX = {
        "text": summary,
        "hash": sha256_text(summary),
        "prov": provenance_block(summary, ai=True),
    }

    return summary + "\n\n" + AI_APPENDIX["prov"]
184
 
185
  # ======================================================
186
+ # COURT BUNDLE (CM/ECF-READY)
187
  # ======================================================
188
 
189
def generate_court_bundle():
    """Write the court bundle zip and return its filesystem path.

    The archive contains one ``Exhibit_NNN.txt`` per entry in
    ``LAST_RESULTS`` (agency heading, resolved URL, provenance block), the
    AI appendix and its provenance when ``AI_APPENDIX`` is set, and the
    static judicial notice and reviewer cover letter.

    Returns:
        Path to ``court_bundle.zip`` inside a freshly created temporary
        directory. The directory is intentionally not removed here; the
        caller or the OS temp cleaner owns its lifetime.
    """
    # mkdtemp rather than TemporaryDirectory: a TemporaryDirectory is deleted
    # when its "with" block exits, which would remove the zip before the
    # returned path could be used.
    out_dir = tempfile.mkdtemp(prefix="court_bundle_")
    path = os.path.join(out_dir, "court_bundle.zip")

    with zipfile.ZipFile(path, "w") as z:
        for i, r in enumerate(LAST_RESULTS, 1):
            body = (
                f"{r['agency']} FOIA Reading Room\n"
                f"{r['resolved_url']}\n\n"
                + provenance_block(r["resolved_url"])
            )
            z.writestr(f"Exhibit_{i:03d}.txt", body)

        if AI_APPENDIX:
            z.writestr("Exhibit_AI_Appendix.txt", AI_APPENDIX["text"])
            z.writestr(
                "Exhibit_AI_Appendix.provenance.txt", AI_APPENDIX["prov"]
            )

        z.writestr("Judicial_Notice.txt", JUDICIAL_NOTICE)
        z.writestr("HF_Reviewer_Cover_Letter.txt", HF_REVIEWER_COVER_LETTER)

    return path
 
209
 
210
  # ======================================================
211
+ # STATIC GOVERNANCE TEXT
212
  # ======================================================
213
 
214
+ JUDICIAL_NOTICE = """
215
+ This system provides navigation to public FOIA reading rooms only.
216
+ It does not host, certify, authenticate, or modify records.
217
+ Authoritative documents remain with issuing agencies.
218
+ """
219
 
220
+ HF_REVIEWER_COVER_LETTER = """
221
+ This Hugging Face Space is a governance-first reference implementation.
 
 
 
222
 
223
+ Link-out only
224
+ • Public FOIA sources only
225
+ • AI is opt-in, hashed, and user-initiated
226
+ • No document scraping or indexing
227
+ • Court-safe by design
228
  """
229
 
230
  # ======================================================
231
+ # FASTAPI
232
  # ======================================================
233
 
234
+ api = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
+ @api.get("/ask_ai")
237
+ def ask_ai_api(index: int):
238
+ return JSONResponse({"result": ask_ai(index)})
239
 
240
+ if os.path.exists("governance-site"):
241
+ api.mount("/gov", StaticFiles(directory="governance-site", html=True))
242
 
243
  # ======================================================
244
  # UI
245
  # ======================================================
246
 
247
+ CSS = """
248
+ button { border-radius:999px !important; }
249
+ .tab-nav { position:sticky; top:0; background:#fff; z-index:999; }
250
+ """
251
+
252
def render_cards():
    """Render every entry of ``LAST_RESULTS`` as an HTML result card.

    Each card shows the agency, the title, any base64 PNG thumbnails,
    View / Download / Share links to the resolved URL, and an "Ask AI"
    button that requests ``/ask_ai`` with the card's index. The Download
    link is dimmed and made non-interactive for results that are not PDFs.

    Returns:
        All cards concatenated into one HTML string; an empty string when
        there are no results.
    """
    from html import escape

    cards = []
    for i, r in enumerate(LAST_RESULTS):
        # Escape everything interpolated into markup or attributes: the URL
        # comes from a remote redirect chain and embeds the user's query, so
        # it must not be able to break out of href="..." or inject tags.
        agency = escape(str(r["agency"]))
        title = escape(str(r["title"]))
        link = escape(str(r["resolved_url"]))

        thumbs = "".join(
            f'<img src="data:image/png;base64,{t}" style="width:120px;border-radius:8px;margin-right:6px;" />'
            for t in r.get("thumbnails", [])
        )
        # A record without "is_pdf" is treated as not being a PDF.
        disabled = "" if r.get("is_pdf") else "opacity:0.4;pointer-events:none;"
        cards.append(f"""
        <div style="border:1px solid #ddd;border-radius:16px;padding:16px;margin-bottom:16px;">
        <b>{agency}</b><br/>
        {title}<br/>
        {thumbs}
        <div style="margin-top:10px;">
        <a href="{link}" target="_blank">View</a> |
        <a href="{link}" download style="{disabled}">Download</a> |
        <a href="{link}" target="_blank">Share</a>
        <button style="background:#1e88e5;color:white;padding:4px 12px;margin-left:10px;border:none;"
        onclick="fetch('/ask_ai?index={i}')">
        Ask AI
        </button>
        </div>
        <div style="font-size:0.75em;color:#666;margin-top:6px;">
        Why am I seeing this? This is a public FOIA reading-room result.
        </div>
        </div>
        """)
    return "".join(cards)
280
 
281
+ with gr.Blocks(css=CSS) as demo:
282
  gr.Markdown("## Federal FOIA Intelligence Search")
283
 
284
  with gr.Tab("Search"):
 
287
  table = gr.Dataframe(headers=["Agency", "Title", "URL", "Hash"])
288
  cards = gr.HTML()
289
  status = gr.Textbox()
290
+ gr.Button("Search", elem_classes=["primary"]).click(
291
+ run_search, [query, agencies], [table, cards, status]
292
+ )
293
 
294
  with gr.Tab("Court / Clerk"):
295
+ gr.Button("Generate Court Bundle").click(lambda: generate_court_bundle(), None, gr.File())
 
 
 
 
 
 
 
 
 
296
 
297
+ with gr.Tab("Governance"):
298
+ gr.Markdown(HF_REVIEWER_COVER_LETTER)
 
299
 
300
  demo.queue()
301
  demo.launch(server_name="0.0.0.0", server_port=7860)