GodsDevProject committed on
Commit
3225d63
·
verified ·
1 Parent(s): bcc6c37

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -154
app.py CHANGED
@@ -1,8 +1,15 @@
1
  import gradio as gr
2
  import time
3
  import re
4
- from typing import List, Dict
5
  from collections import defaultdict
 
 
 
 
 
 
 
 
6
 
7
  ###############################################################################
8
  # GLOBAL STATE (HF SAFE)
@@ -10,9 +17,11 @@ from collections import defaultdict
10
 
11
  RESULT_CACHE: List[Dict] = []
12
  CURRENT_INDEX = 0
 
 
13
 
14
  ###############################################################################
15
- # AGENCY POLICY / ROBOTS / LIVE SAFETY
16
  ###############################################################################
17
 
18
  AGENCY_POLICY = {
@@ -21,31 +30,10 @@ AGENCY_POLICY = {
21
  "ODNI": {"robots": True, "live": True},
22
  "USAF": {"robots": True, "live": True},
23
  "NSA": {"robots": False, "live": False},
24
- "NRO": {"robots": False, "live": False},
25
- "SAP": {"robots": False, "live": False},
26
  }
27
 
28
  ###############################################################################
29
- # KILL SWITCH (AUTO + MANUAL SAFE)
30
- ###############################################################################
31
-
32
- class KillSwitch:
33
- def __init__(self):
34
- self.disabled = {}
35
-
36
- def disable(self, agency: str, reason: str):
37
- self.disabled[agency] = reason
38
-
39
- def enabled(self, agency: str) -> bool:
40
- return agency not in self.disabled
41
-
42
- def reason(self, agency: str) -> str:
43
- return self.disabled.get(agency, "")
44
-
45
- KILL = KillSwitch()
46
-
47
- ###############################################################################
48
- # FOIA EXEMPTION CLASSIFIER
49
  ###############################################################################
50
 
51
  B_CODES = {
@@ -57,186 +45,166 @@ B_CODES = {
57
 
58
  def classify_exemptions(text: str) -> Dict:
59
  found = [k for k in B_CODES if k in text.lower()]
60
- confidence = round(min(1.0, 0.3 * len(found)), 2)
61
- return {"codes": found, "confidence": confidence}
62
-
63
- ###############################################################################
64
- # REDACTION SCORING
65
- ###############################################################################
66
 
67
  def redaction_score(text: str) -> float:
68
  hits = sum(k in text.lower() for k in ["redact", "withheld", "b("])
69
  return round(min(1.0, hits * 0.25), 2)
70
 
71
  ###############################################################################
72
- # SEMANTIC CLUSTERING (HF SAFE HEURISTIC)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  ###############################################################################
74
 
75
- def semantic_clusters(results: List[Dict]) -> Dict[str, List[Dict]]:
 
 
 
 
76
  clusters = defaultdict(list)
77
  for r in results:
78
  t = r["title"].lower()
79
  if "uap" in t or "aerial" in t:
80
- clusters["๐Ÿ›ธ UAP / Aerial Phenomena"].append(r)
81
  elif "intelligence" in t:
82
- clusters["๐Ÿง  Intelligence Activities"].append(r)
83
  else:
84
- clusters["๐Ÿ“„ General Records"].append(r)
85
  return clusters
86
 
 
 
 
 
 
 
 
87
  ###############################################################################
88
- # MOCK LIVE SEARCH (REPLACE WITH REAL ADAPTERS SAFELY)
89
  ###############################################################################
90
 
91
- def run_search(query: str) -> List[Dict]:
92
- time.sleep(0.3)
93
-
94
- raw = [
95
- {
96
- "title": "UAP Task Force Preliminary Assessment",
97
- "snippet": "Some material withheld under b(1) and b(3).",
98
- "url": "https://www.dni.gov/files/ODNI/documents/assessments/Prelimary-Assessments-UAP-20210625.pdf",
99
- "agency": "ODNI",
100
- "source": "ODNI FOIA",
101
- },
102
- {
103
- "title": "CIA Memorandum on Aerial Phenomena",
104
- "snippet": "This document contains redactions under b(3).",
105
- "url": "https://www.cia.gov/readingroom/docs/DOC_000001.pdf",
106
- "agency": "CIA",
107
- "source": "CIA FOIA",
108
- },
109
- {
110
- "title": "Project Blue Book Summary",
111
- "snippet": "Historical investigation records.",
112
- "url": "https://www.archives.gov/research/military/air-force/ufos",
113
- "agency": "USAF",
114
- "source": "National Archives",
115
- },
116
- ]
117
-
118
- allowed = []
119
- for r in raw:
120
- policy = AGENCY_POLICY.get(r["agency"], {})
121
  if not policy.get("robots", False):
122
  continue
123
- if not KILL.enabled(r["agency"]):
124
  continue
125
- allowed.append(r)
 
 
 
 
126
 
127
- return allowed
 
 
 
128
 
129
- ###############################################################################
130
- # TEXT UTILITIES
131
- ###############################################################################
132
-
133
- def highlight(text: str, query: str) -> str:
134
- if not query:
135
- return text
136
- return re.sub(
137
- re.escape(query),
138
- lambda m: f"<mark>{m.group(0)}</mark>",
139
- text,
140
- flags=re.IGNORECASE,
141
- )
142
 
143
  ###############################################################################
144
  # RENDERERS
145
  ###############################################################################
146
 
147
- def render_results(results: List[Dict], query: str) -> str:
148
- clusters = semantic_clusters(results)
149
  blocks = []
150
 
151
- for name, items in clusters.items():
152
- section = [f"## {name}"]
153
- for idx, r in enumerate(items):
154
- global_index = RESULT_CACHE.index(r)
155
- ex = classify_exemptions(r["snippet"])
156
 
157
- section.append(
 
 
 
 
 
158
  f"""
159
  **{highlight(r['title'], query)}**
160
- ๐Ÿ›๏ธ {r['agency']} ยท ๐Ÿ“Š Redaction {redaction_score(r['snippet'])}
161
- โš–๏ธ Exemptions: `{', '.join(ex['codes']) or 'None'}` (conf {ex['confidence']})
162
 
163
- ๐Ÿ”— {r['url']}
164
- โžก๏ธ **Select #{global_index}**
165
  """
166
  )
167
- blocks.append("\n\n".join(section))
168
 
169
- return "\n\n---\n\n".join(blocks)
170
-
171
- def render_preview(index: int) -> str:
172
  if not RESULT_CACHE:
173
- return "_No document selected._"
174
 
175
- r = RESULT_CACHE[index]
176
  ex = classify_exemptions(r["snippet"])
177
 
178
- iframe = (
179
- f'<iframe src="{r["url"]}" width="100%" height="520px" '
180
- f'style="border:1px solid #444;border-radius:8px;"></iframe>'
181
- )
182
-
183
  return f"""
184
- ### ๐Ÿ“„ Document Preview
185
  **{r['title']}**
186
- ๐Ÿ›๏ธ {r['agency']} ยท {r['source']}
187
 
188
- ๐Ÿ›ก๏ธ Redaction Risk: **{redaction_score(r['snippet'])}**
189
  โš–๏ธ FOIA Exemptions: `{', '.join(ex['codes']) or 'None'}`
190
- ๐Ÿ”Ž Confidence: **{ex['confidence']}**
191
 
192
- {iframe}
 
193
  """
194
 
195
- def agency_coverage(results: List[Dict]) -> str:
196
  counts = defaultdict(int)
197
  for r in results:
198
  counts[r["agency"]] += 1
199
-
200
  rows = ["| Agency | Docs |", "|---|---|"]
201
  for k, v in sorted(counts.items(), key=lambda x: -x[1]):
202
  rows.append(f"| {k} | {v} |")
203
-
204
  return "\n".join(rows)
205
 
206
  ###############################################################################
207
- # EVENT HANDLERS
208
  ###############################################################################
209
 
210
- def do_search(query: str):
211
  global RESULT_CACHE, CURRENT_INDEX
212
- RESULT_CACHE = run_search(query)
 
 
213
  CURRENT_INDEX = 0
214
 
215
  return (
216
- render_results(RESULT_CACHE, query),
217
  render_preview(0),
218
- agency_coverage(RESULT_CACHE),
219
- 0,
 
 
220
  )
221
 
222
- def select_index(idx: int):
223
  global CURRENT_INDEX
224
  idx = int(max(0, min(idx, len(RESULT_CACHE) - 1)))
225
  CURRENT_INDEX = idx
226
  return render_preview(idx)
227
 
228
- def next_doc():
229
- global CURRENT_INDEX
230
- if CURRENT_INDEX < len(RESULT_CACHE) - 1:
231
- CURRENT_INDEX += 1
232
- return CURRENT_INDEX, render_preview(CURRENT_INDEX)
233
-
234
- def prev_doc():
235
- global CURRENT_INDEX
236
- if CURRENT_INDEX > 0:
237
- CURRENT_INDEX -= 1
238
- return CURRENT_INDEX, render_preview(CURRENT_INDEX)
239
-
240
  ###############################################################################
241
  # UI
242
  ###############################################################################
@@ -244,7 +212,10 @@ def prev_doc():
244
  with gr.Blocks(theme=gr.themes.Soft()) as app:
245
  gr.Markdown("# ๐Ÿ›๏ธ Federated FOIA Intelligence Search")
246
 
247
- query = gr.Textbox(label="Search public FOIA reading rooms")
 
 
 
248
  search_btn = gr.Button("๐Ÿ” Search")
249
 
250
  with gr.Row():
@@ -255,32 +226,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
255
 
256
  with gr.Column(scale=7):
257
  preview_md = gr.Markdown()
258
- with gr.Row():
259
- prev_btn = gr.Button("โฌ…๏ธ Prev")
260
- next_btn = gr.Button("โžก๏ธ Next")
261
- index_box = gr.Number(label="Selected index", precision=0)
262
 
263
- search_btn.click(
264
- do_search,
265
- inputs=query,
266
- outputs=[results_md, preview_md, coverage_md, index_box],
267
- )
268
 
269
- index_box.change(
270
- select_index,
271
- inputs=index_box,
272
- outputs=preview_md,
273
- )
274
 
275
- next_btn.click(
276
- next_doc,
277
- outputs=[index_box, preview_md],
 
278
  )
279
 
280
- prev_btn.click(
281
- prev_doc,
282
- outputs=[index_box, preview_md],
283
- )
284
 
285
  if __name__ == "__main__":
286
  app.launch()
 
1
  import gradio as gr
2
  import time
3
  import re
 
4
  from collections import defaultdict
5
+ from typing import List, Dict
6
+
7
+ from analytics import log_event
8
+ from policy.kill_switch import KillSwitch
9
+ from semantic.faiss_index import SemanticIndex
10
+ from export.journalist_zip import export_zip
11
+ from reports.explainability import explainability_report
12
+ from ingest.registry import get_all_adapters
13
 
14
  ###############################################################################
15
  # GLOBAL STATE (HF SAFE)
 
17
 
18
  RESULT_CACHE: List[Dict] = []
19
  CURRENT_INDEX = 0
20
+ SEMANTIC = SemanticIndex()
21
+ KILL = KillSwitch()
22
 
23
  ###############################################################################
24
+ # AGENCY POLICY / ROBOTS
25
  ###############################################################################
26
 
27
  AGENCY_POLICY = {
 
30
  "ODNI": {"robots": True, "live": True},
31
  "USAF": {"robots": True, "live": True},
32
  "NSA": {"robots": False, "live": False},
 
 
33
  }
34
 
35
  ###############################################################################
36
+ # FOIA EXEMPTIONS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ###############################################################################
38
 
39
  B_CODES = {
 
45
 
def classify_exemptions(text: str) -> Dict:
    """Detect FOIA exemption codes mentioned in *text*.

    The text is lower-cased once and every key of the module-level
    ``B_CODES`` table is tested against it as a substring.

    Returns:
        A dict with two entries:
        ``"codes"`` - the matching ``B_CODES`` keys, in table order;
        ``"confidence"`` - 0.3 per matched code, capped at 1.0 and rounded
        to two decimals.
    """
    # Lower-case once; the original re-lowered the text for every key.
    lowered = text.lower()
    found = [code for code in B_CODES if code in lowered]
    return {
        "codes": found,
        "confidence": round(min(1.0, len(found) * 0.3), 2),
    }
 
 
52
 
def redaction_score(text: str) -> float:
    """Return a heuristic redaction score for *text*.

    Each of the three markers ``"redact"``, ``"withheld"`` and ``"b("``
    that occurs (case-insensitively) anywhere in the text adds 0.25.
    The sum is capped at 1.0 and rounded to two decimals; with three
    markers the highest reachable value is 0.75.
    """
    # Lower-case once rather than once per marker.
    lowered = text.lower()
    hits = sum(marker in lowered for marker in ("redact", "withheld", "b("))
    return round(min(1.0, hits * 0.25), 2)
56
 
57
  ###############################################################################
58
+ # TEXT UTILITIES
59
+ ###############################################################################
60
+
61
def highlight(text: str, query: str) -> str:
    """Wrap every case-insensitive occurrence of *query* in ``<mark>`` tags.

    The query is matched literally (regex metacharacters are escaped) and
    the matched text keeps its original casing. Surrounding whitespace in
    the query is ignored; an empty, blank or ``None`` query returns *text*
    unchanged.
    """
    # A blank query such as " " would otherwise mark every space in the text.
    needle = query.strip() if query else ""
    if not needle:
        return text
    return re.sub(
        re.escape(needle),
        lambda m: f"<mark>{m.group(0)}</mark>",
        text,
        flags=re.IGNORECASE,
    )
70
+
71
+ ###############################################################################
72
+ # SEMANTIC CLUSTERING + TIMELINE
73
  ###############################################################################
74
 
75
def infer_year(text: str) -> str:
    """Return the first plausible four-digit year (19xx/20xx) in *text*.

    A candidate must not be preceded by another digit, so digits buried
    inside a longer number (for example a document ID like ``C05519201``)
    are not mistaken for a year. Trailing digits are still allowed, so a
    compact date such as ``20210625`` yields ``"2021"``.

    Returns the year as a string, or ``"Unknown"`` when none is found.
    """
    m = re.search(r"(?<!\d)(?:19|20)\d{2}", text)
    return m.group(0) if m else "Unknown"
78
+
79
def semantic_clusters(results: List[Dict]):
    """Group results into topic buckets using keywords in the title.

    A title containing "uap" or "aerial" goes to the UAP bucket, one
    containing "intelligence" to the intelligence bucket, and everything
    else to the general bucket (matching is case-insensitive). Buckets
    appear in the order their first member is encountered.

    Returns a ``defaultdict(list)`` mapping bucket label -> list of results.
    """
    clusters = defaultdict(list)
    for r in results:
        title = r["title"].lower()
        # Labels were mojibake in the scraped text; restored to the emoji
        # their byte sequences encode.
        if "uap" in title or "aerial" in title:
            label = "🛸 UAP / Aerial"
        elif "intelligence" in title:
            label = "🧠 Intelligence"
        else:
            label = "📄 General"
        clusters[label].append(r)
    return clusters
90
 
91
def timeline_view(results: List[Dict]):
    """Group results by the year inferred from their title and URL.

    Years are ordered newest first. Results for which ``infer_year``
    finds no year are collected under ``"Unknown"``, which is placed
    last (a plain reverse string sort would put it ahead of every year).

    Returns a plain dict mapping year label -> list of results.
    """
    timeline = defaultdict(list)
    for r in results:
        # The separator keeps trailing title digits from fusing with the URL.
        timeline[infer_year(r["title"] + " " + r["url"])].append(r)

    dated = sorted((y for y in timeline if y != "Unknown"), reverse=True)
    ordered = {year: timeline[year] for year in dated}
    if "Unknown" in timeline:
        ordered["Unknown"] = timeline["Unknown"]
    return ordered
97
+
98
  ###############################################################################
99
+ # SEARCH
100
  ###############################################################################
101
 
102
def run_search(query: str, semantic_refine: bool):
    """Query every permitted adapter and return the combined results.

    An adapter is queried only if ``AGENCY_POLICY`` marks its name with
    ``"robots": True`` and the kill switch still reports it enabled.
    Any exception raised while querying an adapter disables that adapter
    through ``KILL.disable`` (with the exception text as the reason) and
    the search continues with the remaining adapters.

    When *semantic_refine* is true and there is at least one result, the
    results are indexed by ``title + " " + snippet`` and re-ordered by the
    semantic index's ranking for *query*.

    Returns a list of result dicts.
    """
    results = []
    for name, adapter in get_all_adapters().items():
        policy = AGENCY_POLICY.get(name, {})
        if not policy.get("robots", False) or not KILL.enabled(name):
            continue
        try:
            results.extend(adapter.sync_search(query))
        except Exception as e:
            # Deliberate best-effort: a failing adapter trips its own kill
            # switch instead of failing the whole search.
            KILL.disable(name, str(e))

    if semantic_refine and results:
        def text_of(r):
            return r["title"] + " " + r["snippet"]

        SEMANTIC.build([text_of(r) for r in results])
        ranked = SEMANTIC.search(query, k=len(results))

        # NOTE(review): assumes SEMANTIC.search returns the matching texts
        # best-first (the original only tested membership, which left the
        # adapter order untouched) - confirm against semantic.faiss_index.
        rank_of = {}
        for pos, text in enumerate(ranked):
            rank_of.setdefault(text, pos)

        kept = [r for r in results if text_of(r) in rank_of]
        results = sorted(kept, key=lambda r: rank_of[text_of(r)])

    return results
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  ###############################################################################
126
  # RENDERERS
127
  ###############################################################################
128
 
129
def render_results(results, query, view):
    """Render the result list as Markdown, grouped by timeline or cluster.

    Args:
        results: result dicts to render; each needs ``title``, ``snippet``,
            ``agency`` and ``url``.
        query: search text highlighted inside titles.
        view: ``"Timeline"`` groups by inferred year; any other value
            groups by semantic cluster.

    Each entry shows its position in ``RESULT_CACHE`` as the index to
    select. Returns the Markdown string (empty when there are no results).
    """
    if view == "Timeline":
        groups = timeline_view(results)
    else:
        groups = semantic_clusters(results)

    # Identity lookup: O(1) per row, and equal-valued duplicate results keep
    # distinct indices (list.index always returned the first equal dict).
    position = {id(item): i for i, item in enumerate(RESULT_CACHE)}

    blocks = []
    for name, items in groups.items():
        blocks.append(f"## {name}")
        for r in items:
            idx = position.get(id(r))
            if idx is None:
                # Not the cached object itself: fall back to equality
                # lookup, which raises ValueError if absent (as before).
                idx = RESULT_CACHE.index(r)
            ex = classify_exemptions(r["snippet"])
            blocks.append(
                f"""
**{highlight(r['title'], query)}**
🏛️ {r['agency']} · 🛡️ Redaction {redaction_score(r['snippet'])}
⚖️ Exemptions: `{', '.join(ex['codes']) or 'None'}`

➡️ Select **{idx}**
🔗 {r['url']}
"""
            )
    return "\n\n".join(blocks)
153
 
154
def render_preview(idx: int):
    """Render the Markdown/HTML preview for the cached result at *idx*.

    Returns a placeholder string when ``RESULT_CACHE`` is empty. Otherwise
    *idx* is clamped into the valid range (``None`` counts as 0) and the
    result's title, agency, redaction score, exemption codes and an
    ``<iframe>`` pointing at its URL are returned.
    """
    import html  # local import: the module header does not import html

    if not RESULT_CACHE:
        return "_No document selected_"

    # Clamp instead of raising IndexError / wrapping on negative indices.
    wanted = 0 if idx is None else int(idx)
    r = RESULT_CACHE[max(0, min(wanted, len(RESULT_CACHE) - 1))]
    ex = classify_exemptions(r["snippet"])

    # The URL comes from an external adapter; escape it so quotes or angle
    # brackets cannot break out of the src attribute.
    src = html.escape(str(r["url"]), quote=True)

    return f"""
### 📄 Preview
**{r['title']}**
🏛️ {r['agency']}

🛡️ Redaction Risk: {redaction_score(r['snippet'])}
⚖️ FOIA Exemptions: `{', '.join(ex['codes']) or 'None'}`

<iframe src="{src}" width="100%" height="520px"
style="border:1px solid #444;border-radius:8px;"></iframe>
"""
172
 
173
def coverage_table(results):
    """Return a Markdown table of document counts per agency.

    Rows are sorted by count, highest first; agencies with equal counts
    keep the order in which they first appear in *results*. With no
    results only the two header rows are returned.
    """
    counts = defaultdict(int)
    for r in results:
        counts[r["agency"]] += 1

    rows = ["| Agency | Docs |", "|---|---|"]
    # Stable sort: reverse=True keeps first-seen order among ties.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    rows.extend(f"| {agency} | {n} |" for agency, n in ranked)
    return "\n".join(rows)
181
 
182
  ###############################################################################
183
+ # EVENTS
184
  ###############################################################################
185
 
186
def do_search(query, semantic_refine, view):
    """Handle the Search button: run the search and refresh every panel.

    Logs a ``"search"`` event with the query, replaces ``RESULT_CACHE``
    with the new results and resets ``CURRENT_INDEX`` to 0.

    Returns a 6-tuple in the order the click handler's outputs expect:
    results Markdown, preview of the first result, agency coverage table,
    explainability report, export ZIP, and the selected index (0).
    """
    global RESULT_CACHE, CURRENT_INDEX

    log_event("search", {"query": query})

    RESULT_CACHE = run_search(query, semantic_refine)
    CURRENT_INDEX = 0

    return (
        render_results(RESULT_CACHE, query, view),
        render_preview(0),
        coverage_table(RESULT_CACHE),
        explainability_report(RESULT_CACHE),
        export_zip(RESULT_CACHE),
        0,
    )
201
 
202
def select_index(idx):
    """Select the cached result at *idx* and return its preview.

    *idx* is clamped to the valid range of ``RESULT_CACHE`` and stored in
    ``CURRENT_INDEX``. ``None`` (sent when the number box is cleared) is
    treated as 0 instead of raising ``TypeError``.
    """
    global CURRENT_INDEX
    if idx is None:
        idx = 0
    # With an empty cache the upper bound is -1, so the result clamps to 0
    # and render_preview returns its placeholder.
    CURRENT_INDEX = int(max(0, min(idx, len(RESULT_CACHE) - 1)))
    return render_preview(CURRENT_INDEX)
207
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  ###############################################################################
209
  # UI
210
  ###############################################################################
 
212
  with gr.Blocks(theme=gr.themes.Soft()) as app:
213
  gr.Markdown("# ๐Ÿ›๏ธ Federated FOIA Intelligence Search")
214
 
215
+ query = gr.Textbox(label="Search public FOIA records")
216
+ semantic_toggle = gr.Checkbox(label="๐Ÿง  Semantic refine (FAISS)", value=False)
217
+ view = gr.Radio(["Clustered", "Timeline"], value="Clustered")
218
+
219
  search_btn = gr.Button("๐Ÿ” Search")
220
 
221
  with gr.Row():
 
226
 
227
  with gr.Column(scale=7):
228
  preview_md = gr.Markdown()
229
+ index_box = gr.Number(label="Select index", precision=0)
 
 
 
230
 
231
+ with gr.Accordion("๐Ÿงพ Explainability Report", open=False):
232
+ explain_md = gr.Markdown()
 
 
 
233
 
234
+ zip_file = gr.File(label="๐Ÿ“ฆ Journalist ZIP")
 
 
 
 
235
 
236
+ search_btn.click(
237
+ do_search,
238
+ inputs=[query, semantic_toggle, view],
239
+ outputs=[results_md, preview_md, coverage_md, explain_md, zip_file, index_box],
240
  )
241
 
242
+ index_box.change(select_index, inputs=index_box, outputs=preview_md)
 
 
 
243
 
244
  if __name__ == "__main__":
245
  app.launch()