Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import anthropic
|
|
| 8 |
import google.generativeai as genai
|
| 9 |
import gradio as gr
|
| 10 |
from gliner import GLiNER
|
| 11 |
-
from collections import defaultdict
|
| 12 |
import numpy as np
|
| 13 |
import os
|
| 14 |
|
|
@@ -38,7 +38,7 @@ You are an expert research assistant specializing in history. For the provided t
|
|
| 38 |
**Instructions:**
|
| 39 |
1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
|
| 40 |
2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
|
| 41 |
-
3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`).
|
| 42 |
|
| 43 |
**Output Format:**
|
| 44 |
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
|
|
@@ -73,11 +73,12 @@ def generate_from_prompt(prompt, provider, key_dict):
|
|
| 73 |
|
| 74 |
# --- UI Definitions ---
|
| 75 |
|
|
|
|
| 76 |
STANDARD_LABELS = [
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"
|
| 80 |
-
"
|
| 81 |
]
|
| 82 |
|
| 83 |
MAX_CATEGORIES = 8
|
|
@@ -94,7 +95,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 94 |
### Understanding "Entities" and "Labels"
|
| 95 |
In text analysis, this process is often called "Named Entity Recognition" (NER).
|
| 96 |
- An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
|
| 97 |
-
- A **Label** is the category that entity belongs to (e.g., `
|
| 98 |
|
| 99 |
This tool helps you define your labels and then automatically finds the corresponding entities in your text.
|
| 100 |
"""
|
|
@@ -139,9 +140,10 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 139 |
custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
|
| 140 |
|
| 141 |
gr.Markdown("--- \n## Step 3: Run Analysis")
|
| 142 |
-
threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches
|
| 143 |
text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
|
| 144 |
-
|
|
|
|
| 145 |
|
| 146 |
analysis_status = gr.Markdown(visible=False)
|
| 147 |
|
|
@@ -164,9 +166,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 164 |
# --- Backend Functions ---
|
| 165 |
|
| 166 |
def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
|
| 167 |
-
yield {
|
| 168 |
-
generate_btn: gr.update(value="Generating...", interactive=False)
|
| 169 |
-
}
|
| 170 |
|
| 171 |
try:
|
| 172 |
key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
|
|
@@ -214,8 +214,9 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 214 |
raise gr.Error(str(e))
|
| 215 |
|
| 216 |
def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
|
|
|
|
| 217 |
yield {
|
| 218 |
-
analyze_btn: gr.update(value="
|
| 219 |
analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
|
| 220 |
highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
|
| 221 |
}
|
|
@@ -237,7 +238,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 237 |
|
| 238 |
if not text or not final_labels:
|
| 239 |
yield {
|
| 240 |
-
analyze_btn: gr.update(value="
|
| 241 |
analysis_status: gr.update(visible=False),
|
| 242 |
highlighted_text_output: {"text": text, "entities": []},
|
| 243 |
detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
|
|
@@ -257,7 +258,6 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 257 |
unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
|
| 258 |
debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
|
| 259 |
|
| 260 |
-
# --- BUG FIX: Map 'label' to 'entity' for Gradio's HighlightedText component ---
|
| 261 |
highlighted_output_data = {
|
| 262 |
"text": text,
|
| 263 |
"entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
|
|
@@ -272,29 +272,36 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 272 |
if not aggregated_matches[key]['original_casing']:
|
| 273 |
aggregated_matches[key]['original_casing'] = match_text
|
| 274 |
|
| 275 |
-
|
|
|
|
| 276 |
for (label, _), data in aggregated_matches.items():
|
| 277 |
avg_score = np.mean(data['scores'])
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
markdown_string = ""
|
| 281 |
-
|
| 282 |
-
markdown_string += f"### {label}\n"
|
| 283 |
-
markdown_string += "| Text Found | Instances | Avg. Confidence Score* |\n"
|
| 284 |
-
markdown_string += "|------------|-----------|--------------------------|\n"
|
| 285 |
-
for item in sorted(items, key=lambda x: x['count'], reverse=True):
|
| 286 |
-
markdown_string += f"| {item['text']} | {item['count']} | {item['avg_score']:.2f} |\n"
|
| 287 |
-
markdown_string += "\n"
|
| 288 |
-
|
| 289 |
-
if not markdown_string:
|
| 290 |
markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
|
| 291 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
|
| 293 |
-
|
| 294 |
debug_info.append("Analysis complete.")
|
| 295 |
|
| 296 |
yield {
|
| 297 |
-
analyze_btn: gr.update(value="
|
| 298 |
analysis_status: gr.update(visible=False),
|
| 299 |
highlighted_text_output: highlighted_output_data,
|
| 300 |
detailed_results_output: markdown_string,
|
|
@@ -318,7 +325,8 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
|
|
| 318 |
|
| 319 |
# Wire up the dynamic select/deselect buttons
|
| 320 |
for _, cg, sel_btn, desel_btn in dynamic_components:
|
| 321 |
-
|
|
|
|
| 322 |
desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
|
| 323 |
|
| 324 |
analyze_btn.click(
|
|
|
|
| 8 |
import google.generativeai as genai
|
| 9 |
import gradio as gr
|
| 10 |
from gliner import GLiNER
|
| 11 |
+
from collections import defaultdict
|
| 12 |
import numpy as np
|
| 13 |
import os
|
| 14 |
|
|
|
|
| 38 |
**Instructions:**
|
| 39 |
1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
|
| 40 |
2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
|
| 41 |
+
3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
|
| 42 |
|
| 43 |
**Output Format:**
|
| 44 |
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
|
|
|
|
| 73 |
|
| 74 |
# --- UI Definitions ---
|
| 75 |
|
| 76 |
+
# Standard labels are written in Title Case, with spaces instead of underscores.
|
| 77 |
STANDARD_LABELS = [
|
| 78 |
+
"Person", "Organization", "Location", "Country", "City", "State",
|
| 79 |
+
"Nationality", "Group", "Date", "Event", "Law", "Legal Document",
|
| 80 |
+
"Product", "Facility", "Work Of Art", "Language", "Time", "Percentage",
|
| 81 |
+
"Money", "Currency", "Quantity", "Ordinal Number", "Cardinal Number"
|
| 82 |
]
|
| 83 |
|
| 84 |
MAX_CATEGORIES = 8
|
|
|
|
| 95 |
### Understanding "Entities" and "Labels"
|
| 96 |
In text analysis, this process is often called "Named Entity Recognition" (NER).
|
| 97 |
- An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
|
| 98 |
+
- A **Label** is the category that entity belongs to (e.g., `Person`, `Date`, `Location`).
|
| 99 |
|
| 100 |
This tool helps you define your labels and then automatically finds the corresponding entities in your text.
|
| 101 |
"""
|
|
|
|
| 140 |
custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
|
| 141 |
|
| 142 |
gr.Markdown("--- \n## Step 3: Run Analysis")
|
| 143 |
+
threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
|
| 144 |
text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
|
| 145 |
+
# UPDATED BUTTON TEXT
|
| 146 |
+
analyze_btn = gr.Button("Find Entities", variant="primary")
|
| 147 |
|
| 148 |
analysis_status = gr.Markdown(visible=False)
|
| 149 |
|
|
|
|
| 166 |
# --- Backend Functions ---
|
| 167 |
|
| 168 |
def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
|
| 169 |
+
yield {generate_btn: gr.update(value="Generating...", interactive=False)}
|
|
|
|
|
|
|
| 170 |
|
| 171 |
try:
|
| 172 |
key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
|
|
|
|
| 214 |
raise gr.Error(str(e))
|
| 215 |
|
| 216 |
def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
|
| 217 |
+
# First update: disable the button and show a progress message while the analysis runs.
|
| 218 |
yield {
|
| 219 |
+
analyze_btn: gr.update(value="Finding Entities...", interactive=False),
|
| 220 |
analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
|
| 221 |
highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
|
| 222 |
}
|
|
|
|
| 238 |
|
| 239 |
if not text or not final_labels:
|
| 240 |
yield {
|
| 241 |
+
analyze_btn: gr.update(value="Find Entities", interactive=True),
|
| 242 |
analysis_status: gr.update(visible=False),
|
| 243 |
highlighted_text_output: {"text": text, "entities": []},
|
| 244 |
detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
|
|
|
|
| 258 |
unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
|
| 259 |
debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
|
| 260 |
|
|
|
|
| 261 |
highlighted_output_data = {
|
| 262 |
"text": text,
|
| 263 |
"entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
|
|
|
|
| 272 |
if not aggregated_matches[key]['original_casing']:
|
| 273 |
aggregated_matches[key]['original_casing'] = match_text
|
| 274 |
|
| 275 |
+
# --- Flatten the aggregated matches into rows for a single, unified results table ---
|
| 276 |
+
table_rows = []
|
| 277 |
for (label, _), data in aggregated_matches.items():
|
| 278 |
avg_score = np.mean(data['scores'])
|
| 279 |
+
table_rows.append({
|
| 280 |
+
'label': label,
|
| 281 |
+
'text': data['original_casing'],
|
| 282 |
+
'count': data['count'],
|
| 283 |
+
'avg_score': avg_score
|
| 284 |
+
})
|
| 285 |
+
|
| 286 |
+
# Sort the rows by Label (alphabetically), then by count (descending)
|
| 287 |
+
table_rows.sort(key=lambda x: (x['label'], -x['count']))
|
| 288 |
|
| 289 |
markdown_string = ""
|
| 290 |
+
if not table_rows:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
|
| 292 |
else:
|
| 293 |
+
# Build the Markdown table string
|
| 294 |
+
markdown_string += "| Label | Text Found | Instances | Avg. Confidence Score* |\n"
|
| 295 |
+
markdown_string += "|-------|------------|-----------|--------------------------|\n"
|
| 296 |
+
for row in table_rows:
|
| 297 |
+
markdown_string += f"| {row['label']} | {row['text']} | {row['count']} | {row['avg_score']:.2f} |\n"
|
| 298 |
+
|
| 299 |
markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
|
| 300 |
+
|
| 301 |
debug_info.append("Analysis complete.")
|
| 302 |
|
| 303 |
yield {
|
| 304 |
+
analyze_btn: gr.update(value="Find Entities", interactive=True),
|
| 305 |
analysis_status: gr.update(visible=False),
|
| 306 |
highlighted_text_output: highlighted_output_data,
|
| 307 |
detailed_results_output: markdown_string,
|
|
|
|
| 325 |
|
| 326 |
# Wire up the dynamic select/deselect buttons
|
| 327 |
for _, cg, sel_btn, desel_btn in dynamic_components:
|
| 328 |
+
# Bind `cg` as a default argument so each lambda keeps its own component (avoiding the late-binding closure pitfall); `c.choices` then supplies that component's full list of options.
|
| 329 |
+
sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
|
| 330 |
desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
|
| 331 |
|
| 332 |
analyze_btn.click(
|