mk1985 committed on
Commit
d51705e
·
verified ·
1 Parent(s): 5f5b923

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -29
app.py CHANGED
@@ -8,7 +8,7 @@ import anthropic
8
  import google.generativeai as genai
9
  import gradio as gr
10
  from gliner import GLiNER
11
- from collections import defaultdict, Counter
12
  import numpy as np
13
  import os
14
 
@@ -38,7 +38,7 @@ You are an expert research assistant specializing in history. For the provided t
38
  **Instructions:**
39
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
40
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
41
- 3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`).
42
 
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
@@ -73,11 +73,12 @@ def generate_from_prompt(prompt, provider, key_dict):
73
 
74
  # --- UI Definitions ---
75
 
 
76
  STANDARD_LABELS = [
77
- "PERSON", "ORGANIZATION", "LOCATION", "COUNTRY", "CITY", "STATE",
78
- "NATIONALITY", "GROUP", "DATE", "EVENT", "LAW", "LEGAL_DOCUMENT",
79
- "PRODUCT", "FACILITY", "WORK_OF_ART", "LANGUAGE", "TIME", "PERCENTAGE",
80
- "MONEY", "CURRENCY", "QUANTITY", "ORDINAL_NUMBER", "CARDINAL_NUMBER"
81
  ]
82
 
83
  MAX_CATEGORIES = 8
@@ -94,7 +95,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
94
  ### Understanding "Entities" and "Labels"
95
  In text analysis, this process is often called "Named Entity Recognition" (NER).
96
  - An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
97
- - A **Label** is the category that entity belongs to (e.g., `PERSON`, `DATE`, `LOCATION`).
98
 
99
  This tool helps you define your labels and then automatically finds the corresponding entities in your text.
100
  """
@@ -139,9 +140,10 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
139
  custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
140
 
141
  gr.Markdown("--- \n## Step 3: Run Analysis")
142
- threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches (less strict). Higher values return fewer, more precise matches (more strict).")
143
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
144
- analyze_btn = gr.Button("Analyze Text", variant="primary")
 
145
 
146
  analysis_status = gr.Markdown(visible=False)
147
 
@@ -164,9 +166,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
164
  # --- Backend Functions ---
165
 
166
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
167
- yield {
168
- generate_btn: gr.update(value="Generating...", interactive=False)
169
- }
170
 
171
  try:
172
  key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
@@ -214,8 +214,9 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
214
  raise gr.Error(str(e))
215
 
216
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
 
217
  yield {
218
- analyze_btn: gr.update(value="Analyzing...", interactive=False),
219
  analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
220
  highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
221
  }
@@ -237,7 +238,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
237
 
238
  if not text or not final_labels:
239
  yield {
240
- analyze_btn: gr.update(value="Analyze Text", interactive=True),
241
  analysis_status: gr.update(visible=False),
242
  highlighted_text_output: {"text": text, "entities": []},
243
  detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
@@ -257,7 +258,6 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
257
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
258
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
259
 
260
- # --- BUG FIX: Map 'label' to 'entity' for Gradio's HighlightedText component ---
261
  highlighted_output_data = {
262
  "text": text,
263
  "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
@@ -272,29 +272,36 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
272
  if not aggregated_matches[key]['original_casing']:
273
  aggregated_matches[key]['original_casing'] = match_text
274
 
275
- results_by_label = defaultdict(list)
 
276
  for (label, _), data in aggregated_matches.items():
277
  avg_score = np.mean(data['scores'])
278
- results_by_label[label].append({'text': data['original_casing'], 'count': data['count'], 'avg_score': avg_score})
 
 
 
 
 
 
 
 
279
 
280
  markdown_string = ""
281
- for label, items in sorted(results_by_label.items()):
282
- markdown_string += f"### {label}\n"
283
- markdown_string += "| Text Found | Instances | Avg. Confidence Score* |\n"
284
- markdown_string += "|------------|-----------|--------------------------|\n"
285
- for item in sorted(items, key=lambda x: x['count'], reverse=True):
286
- markdown_string += f"| {item['text']} | {item['count']} | {item['avg_score']:.2f} |\n"
287
- markdown_string += "\n"
288
-
289
- if not markdown_string:
290
  markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
291
  else:
 
 
 
 
 
 
292
  markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
293
-
294
  debug_info.append("Analysis complete.")
295
 
296
  yield {
297
- analyze_btn: gr.update(value="Analyze Text", interactive=True),
298
  analysis_status: gr.update(visible=False),
299
  highlighted_text_output: highlighted_output_data,
300
  detailed_results_output: markdown_string,
@@ -318,7 +325,8 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
318
 
319
  # Wire up the dynamic select/deselect buttons
320
  for _, cg, sel_btn, desel_btn in dynamic_components:
321
- sel_btn.click(fn=select_all, inputs=[cg], outputs=[cg])
 
322
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
323
 
324
  analyze_btn.click(
 
8
  import google.generativeai as genai
9
  import gradio as gr
10
  from gliner import GLiNER
11
+ from collections import defaultdict
12
  import numpy as np
13
  import os
14
 
 
38
  **Instructions:**
39
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
40
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
41
+ 3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
42
 
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
 
73
 
74
  # --- UI Definitions ---
75
 
76
+ # REFORMATTED: No underscores, uses Title Case
77
  STANDARD_LABELS = [
78
+ "Person", "Organization", "Location", "Country", "City", "State",
79
+ "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
80
+ "Product", "Facility", "Work Of Art", "Language", "Time", "Percentage",
81
+ "Money", "Currency", "Quantity", "Ordinal Number", "Cardinal Number"
82
  ]
83
 
84
  MAX_CATEGORIES = 8
 
95
  ### Understanding "Entities" and "Labels"
96
  In text analysis, this process is often called "Named Entity Recognition" (NER).
97
  - An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
98
+ - A **Label** is the category that entity belongs to (e.g., `Person`, `Date`, `Location`).
99
 
100
  This tool helps you define your labels and then automatically finds the corresponding entities in your text.
101
  """
 
140
  custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
141
 
142
  gr.Markdown("--- \n## Step 3: Run Analysis")
143
+ threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
144
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
145
+ # UPDATED BUTTON TEXT
146
+ analyze_btn = gr.Button("Find Entities", variant="primary")
147
 
148
  analysis_status = gr.Markdown(visible=False)
149
 
 
166
  # --- Backend Functions ---
167
 
168
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
169
+ yield {generate_btn: gr.update(value="Generating...", interactive=False)}
 
 
170
 
171
  try:
172
  key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
 
214
  raise gr.Error(str(e))
215
 
216
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
217
+ # UPDATED PROGRESS MESSAGE
218
  yield {
219
+ analyze_btn: gr.update(value="Finding Entities...", interactive=False),
220
  analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
221
  highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
222
  }
 
238
 
239
  if not text or not final_labels:
240
  yield {
241
+ analyze_btn: gr.update(value="Find Entities", interactive=True),
242
  analysis_status: gr.update(visible=False),
243
  highlighted_text_output: {"text": text, "entities": []},
244
  detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
 
258
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
259
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
260
 
 
261
  highlighted_output_data = {
262
  "text": text,
263
  "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
 
272
  if not aggregated_matches[key]['original_casing']:
273
  aggregated_matches[key]['original_casing'] = match_text
274
 
275
+ # --- NEW LOGIC FOR SINGLE, UNIFIED TABLE ---
276
+ table_rows = []
277
  for (label, _), data in aggregated_matches.items():
278
  avg_score = np.mean(data['scores'])
279
+ table_rows.append({
280
+ 'label': label,
281
+ 'text': data['original_casing'],
282
+ 'count': data['count'],
283
+ 'avg_score': avg_score
284
+ })
285
+
286
+ # Sort the rows by Label (alphabetically), then by count (descending)
287
+ table_rows.sort(key=lambda x: (x['label'], -x['count']))
288
 
289
  markdown_string = ""
290
+ if not table_rows:
 
 
 
 
 
 
 
 
291
  markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
292
  else:
293
+ # Build the Markdown table string
294
+ markdown_string += "| Label | Text Found | Instances | Avg. Confidence Score* |\n"
295
+ markdown_string += "|-------|------------|-----------|--------------------------|\n"
296
+ for row in table_rows:
297
+ markdown_string += f"| {row['label']} | {row['text']} | {row['count']} | {row['avg_score']:.2f} |\n"
298
+
299
  markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
300
+
301
  debug_info.append("Analysis complete.")
302
 
303
  yield {
304
+ analyze_btn: gr.update(value="Find Entities", interactive=True),
305
  analysis_status: gr.update(visible=False),
306
  highlighted_text_output: highlighted_output_data,
307
  detailed_results_output: markdown_string,
 
325
 
326
  # Wire up the dynamic select/deselect buttons
327
  for _, cg, sel_btn, desel_btn in dynamic_components:
328
+ # BUG FIX: Use a lambda to capture the component `cg` itself, allowing `cg.choices` to provide the full list of options.
329
+ sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
330
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
331
 
332
  analyze_btn.click(