Spaces:

mk1985
/

Historical-Text-Analyser

Sleeping

App Files Community

mk1985 committed on Jul 23, 2025

Commit

d51705e

verified ·

1 Parent(s): 5f5b923

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -29

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import anthropic
 import google.generativeai as genai
 import gradio as gr
 from gliner import GLiNER
-from collections import defaultdict, Counter
 import numpy as np
 import os
@@ -38,7 +38,7 @@ You are an expert research assistant specializing in history. For the provided t
 **Instructions:**
 1.  Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
 2.  For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
-3.  **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`).
 **Output Format:**
 Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
@@ -73,11 +73,12 @@ def generate_from_prompt(prompt, provider, key_dict):
 # --- UI Definitions ---
 STANDARD_LABELS = [
-    "PERSON", "ORGANIZATION", "LOCATION", "COUNTRY", "CITY", "STATE",
-    "NATIONALITY", "GROUP", "DATE", "EVENT", "LAW", "LEGAL_DOCUMENT",
-    "PRODUCT", "FACILITY", "WORK_OF_ART", "LANGUAGE", "TIME", "PERCENTAGE",
-    "MONEY", "CURRENCY", "QUANTITY", "ORDINAL_NUMBER", "CARDINAL_NUMBER"
 ]
 MAX_CATEGORIES = 8
@@ -94,7 +95,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
         ### Understanding "Entities" and "Labels"
         In text analysis, this process is often called "Named Entity Recognition" (NER).
         - An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
-        - A **Label** is the category that entity belongs to (e.g., `PERSON`, `DATE`, `LOCATION`).
         This tool helps you define your labels and then automatically finds the corresponding entities in your text.
         """
@@ -139,9 +140,10 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
         custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
     gr.Markdown("--- \n## Step 3: Run Analysis")
-    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches (less strict). Higher values return fewer, more precise matches (more strict).")
     text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
-    analyze_btn = gr.Button("Analyze Text", variant="primary")
     analysis_status = gr.Markdown(visible=False)
@@ -164,9 +166,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
     # --- Backend Functions ---
     def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
-        yield {
-            generate_btn: gr.update(value="Generating...", interactive=False)
-        }
         try:
             key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
@@ -214,8 +214,9 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
             raise gr.Error(str(e))
     def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
         yield {
-            analyze_btn: gr.update(value="Analyzing...", interactive=False),
             analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
             highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
         }
@@ -237,7 +238,7 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
         if not text or not final_labels:
             yield {
-                analyze_btn: gr.update(value="Analyze Text", interactive=True),
                 analysis_status: gr.update(visible=False),
                 highlighted_text_output: {"text": text, "entities": []},
                 detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
@@ -257,7 +258,6 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
         unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
         debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
-        # --- BUG FIX: Map 'label' to 'entity' for Gradio's HighlightedText component ---
         highlighted_output_data = {
             "text": text,
             "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
@@ -272,29 +272,36 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
             if not aggregated_matches[key]['original_casing']:
                 aggregated_matches[key]['original_casing'] = match_text
-        results_by_label = defaultdict(list)
         for (label, _), data in aggregated_matches.items():
             avg_score = np.mean(data['scores'])
-            results_by_label[label].append({'text': data['original_casing'], 'count': data['count'], 'avg_score': avg_score})
         markdown_string = ""
-        for label, items in sorted(results_by_label.items()):
-            markdown_string += f"### {label}\n"
-            markdown_string += "| Text Found | Instances | Avg. Confidence Score* |\n"
-            markdown_string += "|------------|-----------|--------------------------|\n"
-            for item in sorted(items, key=lambda x: x['count'], reverse=True):
-                markdown_string += f"| {item['text']} | {item['count']} | {item['avg_score']:.2f} |\n"
-            markdown_string += "\n"
-        if not markdown_string:
             markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
         else:
             markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
         debug_info.append("Analysis complete.")
         yield {
-            analyze_btn: gr.update(value="Analyze Text", interactive=True),
             analysis_status: gr.update(visible=False),
             highlighted_text_output: highlighted_output_data,
             detailed_results_output: markdown_string,
@@ -318,7 +325,8 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
     # Wire up the dynamic select/deselect buttons
     for _, cg, sel_btn, desel_btn in dynamic_components:
-        sel_btn.click(fn=select_all, inputs=[cg], outputs=[cg])
         desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
     analyze_btn.click(

 import google.generativeai as genai
 import gradio as gr
 from gliner import GLiNER
+from collections import defaultdict
 import numpy as np
 import os
 **Instructions:**
 1.  Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
 2.  For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
+3.  **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
 **Output Format:**
 Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
 # --- UI Definitions ---
+# REFORMATTED: No underscores, uses Title Case
 STANDARD_LABELS = [
+    "Person", "Organization", "Location", "Country", "City", "State",
+    "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
+    "Product", "Facility", "Work Of Art", "Language", "Time", "Percentage",
+    "Money", "Currency", "Quantity", "Ordinal Number", "Cardinal Number"
 ]
 MAX_CATEGORIES = 8
         ### Understanding "Entities" and "Labels"
         In text analysis, this process is often called "Named Entity Recognition" (NER).
         - An **Entity** is a specific piece of text in your document, like a name, a place, or a date (e.g., `Queen Victoria`, `1848`, `London`).
+        - A **Label** is the category that entity belongs to (e.g., `Person`, `Date`, `Location`).
         This tool helps you define your labels and then automatically finds the corresponding entities in your text.
         """
         custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
     gr.Markdown("--- \n## Step 3: Run Analysis")
+    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
     text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
+    # UPDATED BUTTON TEXT
+    analyze_btn = gr.Button("Find Entities", variant="primary")
     analysis_status = gr.Markdown(visible=False)
     # --- Backend Functions ---
     def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
+        yield {generate_btn: gr.update(value="Generating...", interactive=False)}
         try:
             key_dict = {"openai_key": os.environ.get("OPENAI_API_KEY", openai_k), "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k), "google_key": os.environ.get("GOOGLE_API_KEY", google_k)}
             raise gr.Error(str(e))
     def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
+        # UPDATED PROGRESS MESSAGE
         yield {
+            analyze_btn: gr.update(value="Finding Entities...", interactive=False),
             analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
             highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
         }
         if not text or not final_labels:
             yield {
+                analyze_btn: gr.update(value="Find Entities", interactive=True),
                 analysis_status: gr.update(visible=False),
                 highlighted_text_output: {"text": text, "entities": []},
                 detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
         unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
         debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
         highlighted_output_data = {
             "text": text,
             "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
             if not aggregated_matches[key]['original_casing']:
                 aggregated_matches[key]['original_casing'] = match_text
+        # --- NEW LOGIC FOR SINGLE, UNIFIED TABLE ---
+        table_rows = []
         for (label, _), data in aggregated_matches.items():
             avg_score = np.mean(data['scores'])
+            table_rows.append({
+                'label': label,
+                'text': data['original_casing'],
+                'count': data['count'],
+                'avg_score': avg_score
+            })
+        # Sort the rows by Label (alphabetically), then by count (descending)
+        table_rows.sort(key=lambda x: (x['label'], -x['count']))
         markdown_string = ""
+        if not table_rows:
             markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
         else:
+            # Build the Markdown table string
+            markdown_string += "| Label | Text Found | Instances | Avg. Confidence Score* |\n"
+            markdown_string += "|-------|------------|-----------|--------------------------|\n"
+            for row in table_rows:
+                markdown_string += f"| {row['label']} | {row['text']} | {row['count']} | {row['avg_score']:.2f} |\n"
             markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
         debug_info.append("Analysis complete.")
         yield {
+            analyze_btn: gr.update(value="Find Entities", interactive=True),
             analysis_status: gr.update(visible=False),
             highlighted_text_output: highlighted_output_data,
             detailed_results_output: markdown_string,
     # Wire up the dynamic select/deselect buttons
     for _, cg, sel_btn, desel_btn in dynamic_components:
+        # BUG FIX: Use a lambda to capture the component `cg` itself, allowing `cg.choices` to provide the full list of options.
+        sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
         desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
     analyze_btn.click(