mk1985 committed on
Commit
f9a5a03
·
verified ·
1 Parent(s): d51705e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -50
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # 📚 Install dependencies
2
  # Make sure to run this in your environment if you haven't already
3
- # !pip install openai anthropic google-generativeai gradio transformers torch gliner numpy --quiet
4
 
5
  # ⚙️ Imports
6
  import openai
@@ -10,7 +10,9 @@ import gradio as gr
10
  from gliner import GLiNER
11
  from collections import defaultdict
12
  import numpy as np
 
13
  import os
 
14
 
15
  # 🧠 Supported models and their providers
16
  MODEL_OPTIONS = {
@@ -34,15 +36,12 @@ except Exception as e:
34
  # 🧠 Prompt for the Conceptual AI to generate a research framework
35
  FRAMEWORK_PROMPT_TEMPLATE = """
36
  You are an expert research assistant specializing in history. For the provided topic: **"{topic}"**, your task is to generate a conceptual research framework.
37
-
38
  **Instructions:**
39
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
40
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
41
  3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
42
-
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
45
-
46
  ### Example Category: Political Actions
47
  - Petition, Charter, Protest, Rally, Legislation
48
  ### Example Category: Social Groups
@@ -51,6 +50,7 @@ Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-
51
 
52
  # 🧠 Generator Function (The "Conceptual AI")
53
  def generate_from_prompt(prompt, provider, key_dict):
 
54
  provider_id = MODEL_OPTIONS.get(provider)
55
  api_key = key_dict.get(f"{provider_id}_key")
56
  if not api_key:
@@ -73,7 +73,6 @@ def generate_from_prompt(prompt, provider, key_dict):
73
 
74
  # --- UI Definitions ---
75
 
76
- # REFORMATTED: No underscores, uses Title Case
77
  STANDARD_LABELS = [
78
  "Person", "Organization", "Location", "Country", "City", "State",
79
  "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
@@ -84,7 +83,11 @@ STANDARD_LABELS = [
84
  MAX_CATEGORIES = 8
85
 
86
  with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
 
 
 
87
  gr.Markdown("# Historical Text Analysis Tool")
 
88
  gr.Markdown(
89
  """
90
  This tool uses two forms of AI to accelerate historical research. First, a **Conceptual AI** generates a research framework with relevant search terms for your topic. Second, an **Extraction AI** scans your source text to find and highlight those terms with high precision.
@@ -142,7 +145,6 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
142
  gr.Markdown("--- \n## Step 3: Run Analysis")
143
  threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
144
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
145
- # UPDATED BUTTON TEXT
146
  analyze_btn = gr.Button("Find Entities", variant="primary")
147
 
148
  analysis_status = gr.Markdown(visible=False)
@@ -159,13 +161,28 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
159
  with gr.TabItem("Highlighted Text"):
160
  highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
161
  with gr.TabItem("Detailed Results"):
162
- detailed_results_output = gr.Markdown(label="Aggregated List of Found Entities")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  with gr.TabItem("Debug Log"):
164
  debug_output = gr.Textbox(label="Extraction Process Log", interactive=False, lines=8)
165
 
166
  # --- Backend Functions ---
167
 
168
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
 
169
  yield {generate_btn: gr.update(value="Generating...", interactive=False)}
170
 
171
  try:
@@ -214,24 +231,22 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
214
  raise gr.Error(str(e))
215
 
216
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
217
- # UPDATED PROGRESS MESSAGE
218
  yield {
219
  analyze_btn: gr.update(value="Finding Entities...", interactive=False),
220
- analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
221
- highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
 
222
  }
223
 
 
224
  debug_info = []
225
- if gliner_model is None:
226
- raise gr.Error("Extraction AI (GLiNER model) is not loaded. Cannot analyze text. Please check logs and restart.")
227
-
228
  labels_to_use = set()
229
  for group in suggested_labels_from_groups:
230
  if group: labels_to_use.update(group)
231
  if standard_labels: labels_to_use.update(standard_labels)
232
  custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
233
  if custom: labels_to_use.update(custom)
234
-
235
  final_labels = sorted(list(labels_to_use))
236
  debug_info.append(f"Searching for {len(final_labels)} unique labels.")
237
  debug_info.append(f"Confidence Threshold set to: {threshold}")
@@ -241,11 +256,13 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
241
  analyze_btn: gr.update(value="Find Entities", interactive=True),
242
  analysis_status: gr.update(visible=False),
243
  highlighted_text_output: {"text": text, "entities": []},
244
- detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
 
245
  debug_output: "Analysis stopped: No text or no labels provided."
246
  }
247
  return
248
 
 
249
  all_entities = []
250
  chunk_size, overlap = 1024, 100
251
  for i in range(0, len(text), chunk_size - overlap):
@@ -254,14 +271,10 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
254
  for ent in chunk_entities:
255
  ent['start'] += i; ent['end'] += i
256
  all_entities.append(ent)
257
-
258
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
259
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
260
 
261
- highlighted_output_data = {
262
- "text": text,
263
- "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
264
- }
265
 
266
  aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
267
  for ent in unique_entities:
@@ -269,44 +282,46 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
269
  key = (ent['label'], match_text.lower())
270
  aggregated_matches[key]['count'] += 1
271
  aggregated_matches[key]['scores'].append(ent['score'])
272
- if not aggregated_matches[key]['original_casing']:
273
- aggregated_matches[key]['original_casing'] = match_text
274
 
275
- # --- NEW LOGIC FOR SINGLE, UNIFIED TABLE ---
276
  table_rows = []
277
  for (label, _), data in aggregated_matches.items():
278
  avg_score = np.mean(data['scores'])
279
  table_rows.append({
280
- 'label': label,
281
- 'text': data['original_casing'],
282
- 'count': data['count'],
283
- 'avg_score': avg_score
284
  })
285
 
286
- # Sort the rows by Label (alphabetically), then by count (descending)
287
- table_rows.sort(key=lambda x: (x['label'], -x['count']))
288
-
289
- markdown_string = ""
290
- if not table_rows:
291
- markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
292
- else:
293
- # Build the Markdown table string
294
- markdown_string += "| Label | Text Found | Instances | Avg. Confidence Score* |\n"
295
- markdown_string += "|-------|------------|-----------|--------------------------|\n"
296
- for row in table_rows:
297
- markdown_string += f"| {row['label']} | {row['text']} | {row['count']} | {row['avg_score']:.2f} |\n"
298
-
299
- markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
300
-
301
  debug_info.append("Analysis complete.")
302
 
303
  yield {
304
  analyze_btn: gr.update(value="Find Entities", interactive=True),
305
  analysis_status: gr.update(visible=False),
306
  highlighted_text_output: highlighted_output_data,
307
- detailed_results_output: markdown_string,
 
 
 
308
  debug_output: "\n".join(debug_info)
309
  }
 
 
 
 
 
 
 
 
 
 
310
 
311
  # --- Wire up UI events ---
312
  generate_btn.click(
@@ -315,24 +330,28 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
315
  outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
316
  )
317
 
318
- def deselect_all():
319
- return gr.update(value=[])
320
- def select_all(choices):
321
- return gr.update(value=choices)
322
 
323
  deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
324
  select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])
325
 
326
- # Wire up the dynamic select/deselect buttons
327
  for _, cg, sel_btn, desel_btn in dynamic_components:
328
- # BUG FIX: Use a lambda to capture the component `cg` itself, allowing `cg.choices` to provide the full list of options.
329
  sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
330
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
331
 
332
  analyze_btn.click(
333
  fn=analyze_text,
334
  inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider] + [cg for acc, cg, sel, desel in dynamic_components],
335
- outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output]
 
 
 
 
 
 
 
 
336
  )
337
 
338
  demo.launch(share=True, debug=True)
 
1
  # 📚 Install dependencies
2
  # Make sure to run this in your environment if you haven't already
3
+ # !pip install openai anthropic google-generativeai gradio transformers torch gliner numpy pandas --quiet
4
 
5
  # ⚙️ Imports
6
  import openai
 
10
  from gliner import GLiNER
11
  from collections import defaultdict
12
  import numpy as np
13
+ import pandas as pd # Import pandas for DataFrame
14
  import os
15
+ import tempfile # For creating temporary CSV files
16
 
17
  # 🧠 Supported models and their providers
18
  MODEL_OPTIONS = {
 
36
  # 🧠 Prompt for the Conceptual AI to generate a research framework
37
  FRAMEWORK_PROMPT_TEMPLATE = """
38
  You are an expert research assistant specializing in history. For the provided topic: **"{topic}"**, your task is to generate a conceptual research framework.
 
39
  **Instructions:**
40
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
41
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
42
  3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
 
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
 
45
  ### Example Category: Political Actions
46
  - Petition, Charter, Protest, Rally, Legislation
47
  ### Example Category: Social Groups
 
50
 
51
  # 🧠 Generator Function (The "Conceptual AI")
52
  def generate_from_prompt(prompt, provider, key_dict):
53
+ # (This function remains unchanged)
54
  provider_id = MODEL_OPTIONS.get(provider)
55
  api_key = key_dict.get(f"{provider_id}_key")
56
  if not api_key:
 
73
 
74
  # --- UI Definitions ---
75
 
 
76
  STANDARD_LABELS = [
77
  "Person", "Organization", "Location", "Country", "City", "State",
78
  "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
 
83
  MAX_CATEGORIES = 8
84
 
85
  with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
86
+ # Invisible component to store the results DataFrame for later use (like exporting)
87
+ results_state = gr.State()
88
+
89
  gr.Markdown("# Historical Text Analysis Tool")
90
+ # ... (Introduction and Step 1-3 UI remains the same)
91
  gr.Markdown(
92
  """
93
  This tool uses two forms of AI to accelerate historical research. First, a **Conceptual AI** generates a research framework with relevant search terms for your topic. Second, an **Extraction AI** scans your source text to find and highlight those terms with high precision.
 
145
  gr.Markdown("--- \n## Step 3: Run Analysis")
146
  threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
147
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
 
148
  analyze_btn = gr.Button("Find Entities", variant="primary")
149
 
150
  analysis_status = gr.Markdown(visible=False)
 
161
  with gr.TabItem("Highlighted Text"):
162
  highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
163
  with gr.TabItem("Detailed Results"):
164
+ # NEW: Helpful text about copy/pasting and exporting
165
+ gr.Markdown("You can sort the table by clicking on column headers or filter by typing in the search box below. Use the button to export the full table to a CSV file.")
166
+ with gr.Row():
167
+ export_btn = gr.Button("Export Results to CSV")
168
+
169
+ # NEW: Switched to gr.DataFrame for interactive results
170
+ detailed_results_output = gr.DataFrame(
171
+ headers=["Label", "Text Found", "Instances", "Confidence Score"],
172
+ datatype=["str", "str", "number", "number"],
173
+ label="Aggregated List of Found Entities"
174
+ )
175
+
176
+ # NEW: File output component for the download link
177
+ csv_file_output = gr.File(label="Download CSV", visible=False)
178
+
179
  with gr.TabItem("Debug Log"):
180
  debug_output = gr.Textbox(label="Extraction Process Log", interactive=False, lines=8)
181
 
182
  # --- Backend Functions ---
183
 
184
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
185
+ # (This function remains unchanged)
186
  yield {generate_btn: gr.update(value="Generating...", interactive=False)}
187
 
188
  try:
 
231
  raise gr.Error(str(e))
232
 
233
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
 
234
  yield {
235
  analyze_btn: gr.update(value="Finding Entities...", interactive=False),
236
+ analysis_status: gr.update(value="The Extraction AI is scanning your text...", visible=True),
237
+ highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis...",
238
+ csv_file_output: gr.update(visible=False) # Hide old CSV link
239
  }
240
 
241
+ # ... (Label collection logic is the same)
242
  debug_info = []
243
+ if gliner_model is None: raise gr.Error("Extraction AI (GLiNER model) is not loaded.")
 
 
244
  labels_to_use = set()
245
  for group in suggested_labels_from_groups:
246
  if group: labels_to_use.update(group)
247
  if standard_labels: labels_to_use.update(standard_labels)
248
  custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
249
  if custom: labels_to_use.update(custom)
 
250
  final_labels = sorted(list(labels_to_use))
251
  debug_info.append(f"Searching for {len(final_labels)} unique labels.")
252
  debug_info.append(f"Confidence Threshold set to: {threshold}")
 
256
  analyze_btn: gr.update(value="Find Entities", interactive=True),
257
  analysis_status: gr.update(visible=False),
258
  highlighted_text_output: {"text": text, "entities": []},
259
+ detailed_results_output: None,
260
+ results_state: None, # Clear state
261
  debug_output: "Analysis stopped: No text or no labels provided."
262
  }
263
  return
264
 
265
+ # ... (GLiNER prediction logic is the same)
266
  all_entities = []
267
  chunk_size, overlap = 1024, 100
268
  for i in range(0, len(text), chunk_size - overlap):
 
271
  for ent in chunk_entities:
272
  ent['start'] += i; ent['end'] += i
273
  all_entities.append(ent)
 
274
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
275
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
276
 
277
+ highlighted_output_data = {"text": text, "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]}
 
 
 
278
 
279
  aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
280
  for ent in unique_entities:
 
282
  key = (ent['label'], match_text.lower())
283
  aggregated_matches[key]['count'] += 1
284
  aggregated_matches[key]['scores'].append(ent['score'])
285
+ if not aggregated_matches[key]['original_casing']: aggregated_matches[key]['original_casing'] = match_text
 
286
 
287
+ # --- NEW LOGIC FOR PANDAS DATAFRAME ---
288
  table_rows = []
289
  for (label, _), data in aggregated_matches.items():
290
  avg_score = np.mean(data['scores'])
291
  table_rows.append({
292
+ "Label": label,
293
+ "Text Found": data['original_casing'],
294
+ "Instances": data['count'],
295
+ "Confidence Score": round(avg_score, 2)
296
  })
297
 
298
+ # Create DataFrame and sort it
299
+ results_df = pd.DataFrame(table_rows)
300
+ if not results_df.empty:
301
+ results_df = results_df.sort_values(by=["Label", "Instances"], ascending=[True, False])
302
+
 
 
 
 
 
 
 
 
 
 
303
  debug_info.append("Analysis complete.")
304
 
305
  yield {
306
  analyze_btn: gr.update(value="Find Entities", interactive=True),
307
  analysis_status: gr.update(visible=False),
308
  highlighted_text_output: highlighted_output_data,
309
+ # Output the DataFrame to the gr.DataFrame component
310
+ detailed_results_output: results_df,
311
+ # Store the DataFrame in the invisible gr.State component
312
+ results_state: results_df,
313
  debug_output: "\n".join(debug_info)
314
  }
315
+
316
+ # --- NEW FUNCTION TO HANDLE CSV EXPORT ---
317
+ def export_to_csv(df):
318
+ if df is None or df.empty:
319
+ gr.Info("No data to export. Please run 'Find Entities' first.")
320
+ return None # Return None to keep the file component hidden
321
+
322
+ with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv', encoding='utf-8') as tmpfile:
323
+ df.to_csv(tmpfile.name, index=False)
324
+ return gr.update(value=tmpfile.name, visible=True)
325
 
326
  # --- Wire up UI events ---
327
  generate_btn.click(
 
330
  outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
331
  )
332
 
333
+ def deselect_all(): return gr.update(value=[])
334
+ def select_all(choices): return gr.update(value=choices)
 
 
335
 
336
  deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
337
  select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])
338
 
 
339
  for _, cg, sel_btn, desel_btn in dynamic_components:
 
340
  sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
341
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
342
 
343
  analyze_btn.click(
344
  fn=analyze_text,
345
  inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider] + [cg for acc, cg, sel, desel in dynamic_components],
346
+ # Add results_state to the outputs list
347
+ outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, results_state, debug_output, csv_file_output]
348
+ )
349
+
350
+ # Wire up the new export button
351
+ export_btn.click(
352
+ fn=export_to_csv,
353
+ inputs=[results_state],
354
+ outputs=[csv_file_output]
355
  )
356
 
357
  demo.launch(share=True, debug=True)