mk1985 committed on
Commit
f9a5a03
·
verified ·
1 Parent(s): d51705e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -50
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # 📚 Install dependencies
2
  # Make sure to run this in your environment if you haven't already
3
- # !pip install openai anthropic google-generativeai gradio transformers torch gliner numpy --quiet
4
 
5
  # ⚙️ Imports
6
  import openai
@@ -10,7 +10,9 @@ import gradio as gr
10
  from gliner import GLiNER
11
  from collections import defaultdict
12
  import numpy as np
 
13
  import os
 
14
 
15
  # 🧠 Supported models and their providers
16
  MODEL_OPTIONS = {
@@ -34,15 +36,12 @@ except Exception as e:
34
  # 🧠 Prompt for the Conceptual AI to generate a research framework
35
  FRAMEWORK_PROMPT_TEMPLATE = """
36
  You are an expert research assistant specializing in history. For the provided topic: **"{topic}"**, your task is to generate a conceptual research framework.
37
-
38
  **Instructions:**
39
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
40
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
41
  3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
42
-
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
45
-
46
  ### Example Category: Political Actions
47
  - Petition, Charter, Protest, Rally, Legislation
48
  ### Example Category: Social Groups
@@ -51,6 +50,7 @@ Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-
51
 
52
  # 🧠 Generator Function (The "Conceptual AI")
53
  def generate_from_prompt(prompt, provider, key_dict):
 
54
  provider_id = MODEL_OPTIONS.get(provider)
55
  api_key = key_dict.get(f"{provider_id}_key")
56
  if not api_key:
@@ -73,7 +73,6 @@ def generate_from_prompt(prompt, provider, key_dict):
73
 
74
  # --- UI Definitions ---
75
 
76
- # REFORMATTED: No underscores, uses Title Case
77
  STANDARD_LABELS = [
78
  "Person", "Organization", "Location", "Country", "City", "State",
79
  "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
@@ -84,7 +83,11 @@ STANDARD_LABELS = [
84
  MAX_CATEGORIES = 8
85
 
86
  with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
 
 
 
87
  gr.Markdown("# Historical Text Analysis Tool")
 
88
  gr.Markdown(
89
  """
90
  This tool uses two forms of AI to accelerate historical research. First, a **Conceptual AI** generates a research framework with relevant search terms for your topic. Second, an **Extraction AI** scans your source text to find and highlight those terms with high precision.
@@ -142,7 +145,6 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
142
  gr.Markdown("--- \n## Step 3: Run Analysis")
143
  threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
144
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
145
- # UPDATED BUTTON TEXT
146
  analyze_btn = gr.Button("Find Entities", variant="primary")
147
 
148
  analysis_status = gr.Markdown(visible=False)
@@ -159,13 +161,28 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
159
  with gr.TabItem("Highlighted Text"):
160
  highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
161
  with gr.TabItem("Detailed Results"):
162
- detailed_results_output = gr.Markdown(label="Aggregated List of Found Entities")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  with gr.TabItem("Debug Log"):
164
  debug_output = gr.Textbox(label="Extraction Process Log", interactive=False, lines=8)
165
 
166
  # --- Backend Functions ---
167
 
168
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
 
169
  yield {generate_btn: gr.update(value="Generating...", interactive=False)}
170
 
171
  try:
@@ -214,24 +231,22 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
214
  raise gr.Error(str(e))
215
 
216
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
217
- # UPDATED PROGRESS MESSAGE
218
  yield {
219
  analyze_btn: gr.update(value="Finding Entities...", interactive=False),
220
- analysis_status: gr.update(value="The Extraction AI is scanning your text. This may take a moment...", visible=True),
221
- highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis..."
 
222
  }
223
 
 
224
  debug_info = []
225
- if gliner_model is None:
226
- raise gr.Error("Extraction AI (GLiNER model) is not loaded. Cannot analyze text. Please check logs and restart.")
227
-
228
  labels_to_use = set()
229
  for group in suggested_labels_from_groups:
230
  if group: labels_to_use.update(group)
231
  if standard_labels: labels_to_use.update(standard_labels)
232
  custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
233
  if custom: labels_to_use.update(custom)
234
-
235
  final_labels = sorted(list(labels_to_use))
236
  debug_info.append(f"Searching for {len(final_labels)} unique labels.")
237
  debug_info.append(f"Confidence Threshold set to: {threshold}")
@@ -241,11 +256,13 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
241
  analyze_btn: gr.update(value="Find Entities", interactive=True),
242
  analysis_status: gr.update(visible=False),
243
  highlighted_text_output: {"text": text, "entities": []},
244
- detailed_results_output: "Analysis stopped: Please provide text and select at least one label to search for.",
 
245
  debug_output: "Analysis stopped: No text or no labels provided."
246
  }
247
  return
248
 
 
249
  all_entities = []
250
  chunk_size, overlap = 1024, 100
251
  for i in range(0, len(text), chunk_size - overlap):
@@ -254,14 +271,10 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
254
  for ent in chunk_entities:
255
  ent['start'] += i; ent['end'] += i
256
  all_entities.append(ent)
257
-
258
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
259
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
260
 
261
- highlighted_output_data = {
262
- "text": text,
263
- "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
264
- }
265
 
266
  aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
267
  for ent in unique_entities:
@@ -269,44 +282,46 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
269
  key = (ent['label'], match_text.lower())
270
  aggregated_matches[key]['count'] += 1
271
  aggregated_matches[key]['scores'].append(ent['score'])
272
- if not aggregated_matches[key]['original_casing']:
273
- aggregated_matches[key]['original_casing'] = match_text
274
 
275
- # --- NEW LOGIC FOR SINGLE, UNIFIED TABLE ---
276
  table_rows = []
277
  for (label, _), data in aggregated_matches.items():
278
  avg_score = np.mean(data['scores'])
279
  table_rows.append({
280
- 'label': label,
281
- 'text': data['original_casing'],
282
- 'count': data['count'],
283
- 'avg_score': avg_score
284
  })
285
 
286
- # Sort the rows by Label (alphabetically), then by count (descending)
287
- table_rows.sort(key=lambda x: (x['label'], -x['count']))
288
-
289
- markdown_string = ""
290
- if not table_rows:
291
- markdown_string = "No entities found. Consider lowering the confidence threshold or refining your labels."
292
- else:
293
- # Build the Markdown table string
294
- markdown_string += "| Label | Text Found | Instances | Avg. Confidence Score* |\n"
295
- markdown_string += "|-------|------------|-----------|--------------------------|\n"
296
- for row in table_rows:
297
- markdown_string += f"| {row['label']} | {row['text']} | {row['count']} | {row['avg_score']:.2f} |\n"
298
-
299
- markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the Extraction AI is that it found the correct label (1.00 = 100% certain). The score is an average across all instances of that text.</i></small>"
300
-
301
  debug_info.append("Analysis complete.")
302
 
303
  yield {
304
  analyze_btn: gr.update(value="Find Entities", interactive=True),
305
  analysis_status: gr.update(visible=False),
306
  highlighted_text_output: highlighted_output_data,
307
- detailed_results_output: markdown_string,
 
 
 
308
  debug_output: "\n".join(debug_info)
309
  }
 
 
 
 
 
 
 
 
 
 
310
 
311
  # --- Wire up UI events ---
312
  generate_btn.click(
@@ -315,24 +330,28 @@ with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break:
315
  outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
316
  )
317
 
318
- def deselect_all():
319
- return gr.update(value=[])
320
- def select_all(choices):
321
- return gr.update(value=choices)
322
 
323
  deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
324
  select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])
325
 
326
- # Wire up the dynamic select/deselect buttons
327
  for _, cg, sel_btn, desel_btn in dynamic_components:
328
- # BUG FIX: Use a lambda to capture the component `cg` itself, allowing `cg.choices` to provide the full list of options.
329
  sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
330
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
331
 
332
  analyze_btn.click(
333
  fn=analyze_text,
334
  inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider] + [cg for acc, cg, sel, desel in dynamic_components],
335
- outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output]
 
 
 
 
 
 
 
 
336
  )
337
 
338
  demo.launch(share=True, debug=True)
 
1
  # 📚 Install dependencies
2
  # Make sure to run this in your environment if you haven't already
3
+ # !pip install openai anthropic google-generativeai gradio transformers torch gliner numpy pandas --quiet
4
 
5
  # ⚙️ Imports
6
  import openai
 
10
  from gliner import GLiNER
11
  from collections import defaultdict
12
  import numpy as np
13
+ import pandas as pd # Import pandas for DataFrame
14
  import os
15
+ import tempfile # For creating temporary CSV files
16
 
17
  # 🧠 Supported models and their providers
18
  MODEL_OPTIONS = {
 
36
  # 🧠 Prompt for the Conceptual AI to generate a research framework
37
  FRAMEWORK_PROMPT_TEMPLATE = """
38
  You are an expert research assistant specializing in history. For the provided topic: **"{topic}"**, your task is to generate a conceptual research framework.
 
39
  **Instructions:**
40
  1. Identify 4-6 high-level **Conceptual Categories** relevant to analyzing this historical topic (e.g., 'Key Figures', 'Core Ideologies', 'Significant Events').
41
  2. For each category, list specific, searchable **Labels** that would appear in a primary or secondary source document.
42
  3. **Crucial Rule for Labels:** Use concise, singular, and fundamental terms (e.g., use `Treaty` not `Diplomatic Treaties`). Use Title Case (e.g. `Working Class`).
 
43
  **Output Format:**
44
  Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its labels.
 
45
  ### Example Category: Political Actions
46
  - Petition, Charter, Protest, Rally, Legislation
47
  ### Example Category: Social Groups
 
50
 
51
  # 🧠 Generator Function (The "Conceptual AI")
52
  def generate_from_prompt(prompt, provider, key_dict):
53
+ # (This function remains unchanged)
54
  provider_id = MODEL_OPTIONS.get(provider)
55
  api_key = key_dict.get(f"{provider_id}_key")
56
  if not api_key:
 
73
 
74
  # --- UI Definitions ---
75
 
 
76
  STANDARD_LABELS = [
77
  "Person", "Organization", "Location", "Country", "City", "State",
78
  "Nationality", "Group", "Date", "Event", "Law", "Legal Document",
 
83
  MAX_CATEGORIES = 8
84
 
85
  with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
86
+ # Invisible component to store the results DataFrame for later use (like exporting)
87
+ results_state = gr.State()
88
+
89
  gr.Markdown("# Historical Text Analysis Tool")
90
+ # ... (Introduction and Step 1-3 UI remains the same)
91
  gr.Markdown(
92
  """
93
  This tool uses two forms of AI to accelerate historical research. First, a **Conceptual AI** generates a research framework with relevant search terms for your topic. Second, an **Extraction AI** scans your source text to find and highlight those terms with high precision.
 
145
  gr.Markdown("--- \n## Step 3: Run Analysis")
146
  threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls the strictness of the Extraction AI. Lower values find more potential matches. Higher values return fewer, more precise matches.")
147
  text_input = gr.Textbox(label="Paste Your Source Text Here for Analysis", lines=15, placeholder="Paste a historical document, an article, or a chapter...")
 
148
  analyze_btn = gr.Button("Find Entities", variant="primary")
149
 
150
  analysis_status = gr.Markdown(visible=False)
 
161
  with gr.TabItem("Highlighted Text"):
162
  highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
163
  with gr.TabItem("Detailed Results"):
164
+ # NEW: Helpful text about copy/pasting and exporting
165
+ gr.Markdown("You can sort the table by clicking on column headers or filter by typing in the search box below. Use the button to export the full table to a CSV file.")
166
+ with gr.Row():
167
+ export_btn = gr.Button("Export Results to CSV")
168
+
169
+ # NEW: Switched to gr.DataFrame for interactive results
170
+ detailed_results_output = gr.DataFrame(
171
+ headers=["Label", "Text Found", "Instances", "Confidence Score"],
172
+ datatype=["str", "str", "number", "number"],
173
+ label="Aggregated List of Found Entities"
174
+ )
175
+
176
+ # NEW: File output component for the download link
177
+ csv_file_output = gr.File(label="Download CSV", visible=False)
178
+
179
  with gr.TabItem("Debug Log"):
180
  debug_output = gr.Textbox(label="Extraction Process Log", interactive=False, lines=8)
181
 
182
  # --- Backend Functions ---
183
 
184
  def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
185
+ # (This function remains unchanged)
186
  yield {generate_btn: gr.update(value="Generating...", interactive=False)}
187
 
188
  try:
 
231
  raise gr.Error(str(e))
232
 
233
  def analyze_text(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
 
234
  yield {
235
  analyze_btn: gr.update(value="Finding Entities...", interactive=False),
236
+ analysis_status: gr.update(value="The Extraction AI is scanning your text...", visible=True),
237
+ highlighted_text_output: None, detailed_results_output: None, debug_output: "Starting analysis...",
238
+ csv_file_output: gr.update(visible=False) # Hide old CSV link
239
  }
240
 
241
+ # ... (Label collection logic is the same)
242
  debug_info = []
243
+ if gliner_model is None: raise gr.Error("Extraction AI (GLiNER model) is not loaded.")
 
 
244
  labels_to_use = set()
245
  for group in suggested_labels_from_groups:
246
  if group: labels_to_use.update(group)
247
  if standard_labels: labels_to_use.update(standard_labels)
248
  custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
249
  if custom: labels_to_use.update(custom)
 
250
  final_labels = sorted(list(labels_to_use))
251
  debug_info.append(f"Searching for {len(final_labels)} unique labels.")
252
  debug_info.append(f"Confidence Threshold set to: {threshold}")
 
256
  analyze_btn: gr.update(value="Find Entities", interactive=True),
257
  analysis_status: gr.update(visible=False),
258
  highlighted_text_output: {"text": text, "entities": []},
259
+ detailed_results_output: None,
260
+ results_state: None, # Clear state
261
  debug_output: "Analysis stopped: No text or no labels provided."
262
  }
263
  return
264
 
265
+ # ... (GLiNER prediction logic is the same)
266
  all_entities = []
267
  chunk_size, overlap = 1024, 100
268
  for i in range(0, len(text), chunk_size - overlap):
 
271
  for ent in chunk_entities:
272
  ent['start'] += i; ent['end'] += i
273
  all_entities.append(ent)
 
274
  unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
275
  debug_info.append(f"Found {len(unique_entities)} raw entity mentions.")
276
 
277
+ highlighted_output_data = {"text": text, "entities": [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]}
 
 
 
278
 
279
  aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
280
  for ent in unique_entities:
 
282
  key = (ent['label'], match_text.lower())
283
  aggregated_matches[key]['count'] += 1
284
  aggregated_matches[key]['scores'].append(ent['score'])
285
+ if not aggregated_matches[key]['original_casing']: aggregated_matches[key]['original_casing'] = match_text
 
286
 
287
+ # --- NEW LOGIC FOR PANDAS DATAFRAME ---
288
  table_rows = []
289
  for (label, _), data in aggregated_matches.items():
290
  avg_score = np.mean(data['scores'])
291
  table_rows.append({
292
+ "Label": label,
293
+ "Text Found": data['original_casing'],
294
+ "Instances": data['count'],
295
+ "Confidence Score": round(avg_score, 2)
296
  })
297
 
298
+ # Create DataFrame and sort it
299
+ results_df = pd.DataFrame(table_rows)
300
+ if not results_df.empty:
301
+ results_df = results_df.sort_values(by=["Label", "Instances"], ascending=[True, False])
302
+
 
 
 
 
 
 
 
 
 
 
303
  debug_info.append("Analysis complete.")
304
 
305
  yield {
306
  analyze_btn: gr.update(value="Find Entities", interactive=True),
307
  analysis_status: gr.update(visible=False),
308
  highlighted_text_output: highlighted_output_data,
309
+ # Output the DataFrame to the gr.DataFrame component
310
+ detailed_results_output: results_df,
311
+ # Store the DataFrame in the invisible gr.State component
312
+ results_state: results_df,
313
  debug_output: "\n".join(debug_info)
314
  }
315
+
316
+ # --- NEW FUNCTION TO HANDLE CSV EXPORT ---
317
+ def export_to_csv(df):
318
+ if df is None or df.empty:
319
+ gr.Info("No data to export. Please run 'Find Entities' first.")
320
+ return None # Return None to keep the file component hidden
321
+
322
+ with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv', encoding='utf-8') as tmpfile:
323
+ df.to_csv(tmpfile.name, index=False)
324
+ return gr.update(value=tmpfile.name, visible=True)
325
 
326
  # --- Wire up UI events ---
327
  generate_btn.click(
 
330
  outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
331
  )
332
 
333
+ def deselect_all(): return gr.update(value=[])
334
+ def select_all(choices): return gr.update(value=choices)
 
 
335
 
336
  deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
337
  select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])
338
 
 
339
  for _, cg, sel_btn, desel_btn in dynamic_components:
 
340
  sel_btn.click(fn=lambda c=cg: gr.update(value=c.choices), inputs=None, outputs=[cg])
341
  desel_btn.click(fn=deselect_all, inputs=None, outputs=[cg])
342
 
343
  analyze_btn.click(
344
  fn=analyze_text,
345
  inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider] + [cg for acc, cg, sel, desel in dynamic_components],
346
+ # Add results_state to the outputs list
347
+ outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, results_state, debug_output, csv_file_output]
348
+ )
349
+
350
+ # Wire up the new export button
351
+ export_btn.click(
352
+ fn=export_to_csv,
353
+ inputs=[results_state],
354
+ outputs=[csv_file_output]
355
  )
356
 
357
  demo.launch(share=True, debug=True)