GodsDevProject committed on
Commit
a717b5a
·
verified ·
1 Parent(s): 37f4615

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -127
app.py CHANGED
@@ -2,14 +2,71 @@ import gradio as gr
2
  import time
3
  import re
4
  from typing import List, Dict
 
5
 
6
  ###############################################################################
7
- # GLOBAL STATE (HF SAFE)
8
  ###############################################################################
9
 
10
  RESULT_CACHE: List[Dict] = []
11
  SELECTED_INDEX = 0
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  ###############################################################################
14
  # UTILITIES
15
  ###############################################################################
@@ -17,167 +74,128 @@ SELECTED_INDEX = 0
17
  def highlight(text: str, query: str) -> str:
18
  if not query:
19
  return text
20
- terms = [re.escape(t) for t in query.split() if len(t) > 1]
21
- if not terms:
22
- return text
23
- pattern = re.compile(rf"({'|'.join(terms)})", re.IGNORECASE)
24
- return pattern.sub(r"<mark>\1</mark>", text)
25
 
26
 
27
  def redaction_score(text: str) -> float:
28
- """
29
- Heuristic confidence score that document may contain redactions.
30
- 0.0 = none detected, 1.0 = heavy redaction likelihood
31
- """
32
- hits = sum(
33
- k in text.lower()
34
- for k in ["redact", "b(1)", "b(3)", "withheld", "classified"]
35
- )
36
  return round(min(1.0, hits * 0.25), 2)
37
 
38
 
39
- def badge(r: Dict) -> str:
40
- live = "🟢 LIVE" if r["live"] else "🟡 STUB"
41
- return f"`{live}` · `{r['agency']}`"
42
-
43
-
44
  ###############################################################################
45
- # MOCK ADAPTER OUTPUT (REPLACE WITH REAL ADAPTERS)
46
  ###############################################################################
47
 
48
- def run_federated_query(query: str) -> List[Dict]:
49
- """
50
- HF-safe simulated adapter aggregation.
51
- """
52
  time.sleep(0.4)
53
 
54
- return [
55
  {
56
- "title": "UAP Task Force Report (Preliminary)",
57
- "snippet": "The UAPTF evaluated a number of UAP incidents...",
58
  "url": "https://www.dni.gov/files/ODNI/documents/assessments/Prelimary-Assessments-UAP-20210625.pdf",
59
  "agency": "ODNI",
60
- "source": "ODNI FOIA Reading Room",
61
- "live": True,
62
  },
63
  {
64
  "title": "CIA Memorandum on Aerial Phenomena",
65
- "snippet": "This document was partially redacted under b(3)...",
66
  "url": "https://www.cia.gov/readingroom/docs/DOC_000001.pdf",
67
  "agency": "CIA",
68
- "source": "CIA FOIA Reading Room",
69
- "live": True,
70
  },
71
  {
72
- "title": "Project BLUE BOOK Summary",
73
- "snippet": "Historical summary of investigations into unidentified objects...",
74
  "url": "https://www.archives.gov/research/military/air-force/ufos",
75
  "agency": "USAF",
76
  "source": "National Archives",
77
- "live": False,
78
  },
79
  ]
80
 
 
 
 
 
 
 
81
 
82
- ###############################################################################
83
- # RESULT RENDERING
84
- ###############################################################################
85
-
86
- def agency_counts(results: List[Dict]) -> str:
87
- counts = {}
88
- for r in results:
89
- counts[r["agency"]] = counts.get(r["agency"], 0) + 1
90
 
91
- lines = ["### 🏛️ Per-Agency Coverage"]
92
- for agency, count in sorted(counts.items()):
93
- lines.append(f"- **{agency}**: {count}")
94
-
95
- return "\n".join(lines)
96
 
 
 
 
97
 
98
- def render_results(results: List[Dict], query: str) -> str:
99
- lines = ["### 📚 Search Results (click to preview)\n"]
 
100
 
101
- for i, r in enumerate(results):
102
- title = highlight(r["title"], query)
103
- snippet = highlight(r["snippet"], query)
104
- score = redaction_score(r["snippet"])
 
105
 
106
- lines.append(
107
- f"""
108
- <div style="cursor:pointer" onclick="">
109
- **{i+1}. {title}**
110
- {badge(r)}
111
- πŸ›‘οΈ Redaction Confidence: **{score}**
112
 
113
- {snippet}
114
 
115
- 🔗 [Open Source]({r['url']})
116
- </div>
117
  """
118
- )
119
 
120
- return "\n\n---\n\n".join(lines)
121
 
 
122
 
123
- ###############################################################################
124
- # PREVIEW PANEL
125
- ###############################################################################
126
 
127
  def render_preview(index: int) -> str:
128
  if not RESULT_CACHE:
129
  return "_No document selected._"
130
 
131
- index = max(0, min(index, len(RESULT_CACHE) - 1))
132
  r = RESULT_CACHE[index]
 
133
 
134
- score = redaction_score(r["snippet"])
135
-
136
- header = f"""
137
- ### 📄 Document Preview ({index + 1}/{len(RESULT_CACHE)})
138
  **{r['title']}**
139
- {r['source']} · `{r['agency']}`
140
- πŸ›‘οΈ Redaction Confidence: **{score}**
141
- """
142
 
143
- iframe = f"""
144
- <iframe src="{r['url']}" width="100%" height="550px"
145
- style="border:1px solid #444;border-radius:8px;"></iframe>
146
- """
147
 
148
- return header + iframe
 
 
149
 
150
 
151
  ###############################################################################
152
- # SEARCH HANDLER
153
  ###############################################################################
154
 
155
- def run_search(query: str):
156
  global RESULT_CACHE, SELECTED_INDEX
157
-
158
- if not query.strip():
159
- return "⚠️ Enter a search term.", "_", "_"
160
-
161
- results = run_federated_query(query)
162
-
163
- RESULT_CACHE = results
164
  SELECTED_INDEX = 0
165
 
166
  return (
167
- render_results(results, query),
168
  render_preview(0),
169
- agency_counts(results),
170
  )
171
 
172
 
173
- def next_doc(idx):
174
- idx = min(idx + 1, len(RESULT_CACHE) - 1)
175
- return render_preview(idx), idx
176
-
177
-
178
- def prev_doc(idx):
179
- idx = max(idx - 1, 0)
180
- return render_preview(idx), idx
181
 
182
 
183
  ###############################################################################
@@ -185,49 +203,31 @@ def prev_doc(idx):
185
  ###############################################################################
186
 
187
  with gr.Blocks(theme=gr.themes.Soft()) as app:
188
- gr.Markdown(
189
- """
190
- # πŸ›οΈ Federated FOIA Document Search
191
- Search public FOIA reading rooms across agencies.
192
- """
193
- )
194
-
195
- query = gr.Textbox(label="Search term", placeholder="e.g. UAP")
196
 
 
197
  search_btn = gr.Button("🔍 Search")
198
 
199
- selected_index = gr.State(0)
200
-
201
  with gr.Row():
202
  with gr.Column(scale=5):
203
  results_md = gr.Markdown()
204
 
205
  with gr.Column(scale=7):
206
  preview_md = gr.Markdown()
207
- with gr.Row():
208
- prev_btn = gr.Button("⬅️ Previous")
209
- next_btn = gr.Button("➡️ Next")
210
 
211
- coverage_md = gr.Markdown()
212
 
213
  search_btn.click(
214
- fn=run_search,
215
  inputs=query,
216
- outputs=[results_md, preview_md, coverage_md],
217
  )
218
 
219
- prev_btn.click(
220
- fn=prev_doc,
221
- inputs=selected_index,
222
- outputs=[preview_md, selected_index],
223
  )
224
 
225
- next_btn.click(
226
- fn=next_doc,
227
- inputs=selected_index,
228
- outputs=[preview_md, selected_index],
229
- )
230
-
231
-
232
  if __name__ == "__main__":
233
  app.launch()
 
2
  import time
3
  import re
4
  from typing import List, Dict
5
+ from collections import defaultdict
6
 
7
  ###############################################################################
8
+ # GLOBAL STATE
9
  ###############################################################################
10
 
11
  RESULT_CACHE: List[Dict] = []
12
  SELECTED_INDEX = 0
13
 
14
+ ###############################################################################
15
+ # ROBOTS.TXT POLICY
16
+ ###############################################################################
17
+
18
+ AGENCY_POLICY = {
19
+ "CIA": {"robots_allowed": True, "reason": "Explicit public FOIA reading room"},
20
+ "ODNI": {"robots_allowed": True, "reason": "Public PDF hosting"},
21
+ "USAF": {"robots_allowed": True, "reason": "National Archives mirror"},
22
+ "NSA": {"robots_allowed": False, "reason": "robots.txt disallows crawling"},
23
+ }
24
+
25
+ ###############################################################################
26
+ # FOIA EXEMPTION CLASSIFIER
27
+ ###############################################################################
28
+
29
+ B_CODES = {
30
+ "b(1)": "National Security",
31
+ "b(3)": "Statutory Exemption",
32
+ "b(5)": "Deliberative Process",
33
+ "b(7)": "Law Enforcement",
34
+ }
35
+
36
+
37
+ def classify_exemptions(text: str) -> Dict:
38
+ found = []
39
+ for code, desc in B_CODES.items():
40
+ if code in text.lower():
41
+ found.append(code)
42
+
43
+ confidence = round(min(1.0, len(found) * 0.3), 2)
44
+
45
+ return {
46
+ "codes": found,
47
+ "confidence": confidence,
48
+ }
49
+
50
+
51
+ ###############################################################################
52
+ # SEMANTIC CLUSTERING (HF SAFE)
53
+ ###############################################################################
54
+
55
+ def semantic_cluster(results: List[Dict]) -> Dict[str, List[Dict]]:
56
+ clusters = defaultdict(list)
57
+
58
+ for r in results:
59
+ title = r["title"].lower()
60
+ if "uap" in title or "aerial" in title:
61
+ clusters["🛸 UAP / Aerial Phenomena"].append(r)
62
+ elif "intelligence" in title:
63
+ clusters["🧠 Intelligence Activities"].append(r)
64
+ else:
65
+ clusters["📄 General Records"].append(r)
66
+
67
+ return clusters
68
+
69
+
70
  ###############################################################################
71
  # UTILITIES
72
  ###############################################################################
 
74
  def highlight(text: str, query: str) -> str:
75
  if not query:
76
  return text
77
+ pattern = re.compile(re.escape(query), re.IGNORECASE)
78
+ return pattern.sub(r"<mark>\g<0></mark>", text)
 
 
 
79
 
80
 
81
  def redaction_score(text: str) -> float:
82
+ hits = sum(k in text.lower() for k in ["redact", "withheld", "b("])
 
 
 
 
 
 
 
83
  return round(min(1.0, hits * 0.25), 2)
84
 
85
 
 
 
 
 
 
86
  ###############################################################################
87
+ # MOCK LIVE QUERY (REPLACE ADAPTERS LATER)
88
  ###############################################################################
89
 
90
+ def run_query(query: str) -> List[Dict]:
 
 
 
91
  time.sleep(0.4)
92
 
93
+ raw = [
94
  {
95
+ "title": "UAP Task Force Preliminary Assessment",
96
+ "snippet": "Some material withheld under b(1) and b(3).",
97
  "url": "https://www.dni.gov/files/ODNI/documents/assessments/Prelimary-Assessments-UAP-20210625.pdf",
98
  "agency": "ODNI",
99
+ "source": "ODNI FOIA",
 
100
  },
101
  {
102
  "title": "CIA Memorandum on Aerial Phenomena",
103
+ "snippet": "This document contains redactions under b(3).",
104
  "url": "https://www.cia.gov/readingroom/docs/DOC_000001.pdf",
105
  "agency": "CIA",
106
+ "source": "CIA FOIA",
 
107
  },
108
  {
109
+ "title": "Project Blue Book Summary",
110
+ "snippet": "Historical investigation records.",
111
  "url": "https://www.archives.gov/research/military/air-force/ufos",
112
  "agency": "USAF",
113
  "source": "National Archives",
 
114
  },
115
  ]
116
 
117
+ # Robots enforcement
118
+ allowed = []
119
+ for r in raw:
120
+ policy = AGENCY_POLICY.get(r["agency"], {})
121
+ if policy.get("robots_allowed", False):
122
+ allowed.append(r)
123
 
124
+ return allowed
 
 
 
 
 
 
 
125
 
 
 
 
 
 
126
 
127
+ ###############################################################################
128
+ # RENDERERS
129
+ ###############################################################################
130
 
131
+ def render_clusters(results: List[Dict], query: str) -> str:
132
+ clusters = semantic_cluster(results)
133
+ blocks = []
134
 
135
+ for cluster, items in clusters.items():
136
+ section = [f"## {cluster}"]
137
+ for i, r in enumerate(items):
138
+ idx = RESULT_CACHE.index(r)
139
+ exemptions = classify_exemptions(r["snippet"])
140
 
141
+ section.append(
142
+ f"""
143
+ **{highlight(r['title'], query)}**
144
+ πŸ›οΈ {r['agency']} Β· πŸ›‘οΈ Redaction {redaction_score(r['snippet'])}
145
+ βš–οΈ Exemptions: `{', '.join(exemptions['codes']) or 'None'}`
146
+ 🔎 Confidence: **{exemptions['confidence']}**
147
 
148
+ 🔗 [Source]({r['url']})
149
 
150
+ ➡️ Select index **{idx}**
 
151
  """
152
+ )
153
 
154
+ blocks.append("\n\n".join(section))
155
 
156
+ return "\n\n---\n\n".join(blocks)
157
 
 
 
 
158
 
159
  def render_preview(index: int) -> str:
160
  if not RESULT_CACHE:
161
  return "_No document selected._"
162
 
 
163
  r = RESULT_CACHE[index]
164
+ exemptions = classify_exemptions(r["snippet"])
165
 
166
+ return f"""
167
+ ### 📄 Preview
 
 
168
  **{r['title']}**
169
+ πŸ›οΈ {r['agency']} Β· {r['source']}
 
 
170
 
171
+ πŸ›‘οΈ Redaction Risk: **{redaction_score(r['snippet'])}**
172
+ βš–οΈ FOIA Exemptions: `{', '.join(exemptions['codes']) or 'None'}`
173
+ 🔎 Confidence: **{exemptions['confidence']}**
 
174
 
175
+ <iframe src="{r['url']}" width="100%" height="520px"
176
+ style="border-radius:8px;border:1px solid #444;"></iframe>
177
+ """
178
 
179
 
180
  ###############################################################################
181
+ # EVENT HANDLERS
182
  ###############################################################################
183
 
184
+ def search(query: str):
185
  global RESULT_CACHE, SELECTED_INDEX
186
+ RESULT_CACHE = run_query(query)
 
 
 
 
 
 
187
  SELECTED_INDEX = 0
188
 
189
  return (
190
+ render_clusters(RESULT_CACHE, query),
191
  render_preview(0),
 
192
  )
193
 
194
 
195
+ def select_index(idx: int):
196
+ global SELECTED_INDEX
197
+ SELECTED_INDEX = idx
198
+ return render_preview(idx)
 
 
 
 
199
 
200
 
201
  ###############################################################################
 
203
  ###############################################################################
204
 
205
  with gr.Blocks(theme=gr.themes.Soft()) as app:
206
+ gr.Markdown("# 🏛️ Federated FOIA Intelligence Search")
 
 
 
 
 
 
 
207
 
208
+ query = gr.Textbox(label="Search")
209
  search_btn = gr.Button("🔍 Search")
210
 
 
 
211
  with gr.Row():
212
  with gr.Column(scale=5):
213
  results_md = gr.Markdown()
214
 
215
  with gr.Column(scale=7):
216
  preview_md = gr.Markdown()
 
 
 
217
 
218
+ select_box = gr.Number(label="Select result index", precision=0)
219
 
220
  search_btn.click(
221
+ fn=search,
222
  inputs=query,
223
+ outputs=[results_md, preview_md],
224
  )
225
 
226
+ select_box.change(
227
+ fn=select_index,
228
+ inputs=select_box,
229
+ outputs=preview_md,
230
  )
231
 
 
 
 
 
 
 
 
232
  if __name__ == "__main__":
233
  app.launch()