Spaces:

mk1985
/

Historical-Text-Analyser

Sleeping

File size: 18,109 Bytes

2f5ff37
 
e9738aa
2f5ff37
 
 
 
 
 
 
e9738aa
31914d5
e9738aa
80cecba
2f5ff37
 
 
 
 
 
 
 
 
 
 
 
 
e9738aa
2f5ff37
e9738aa
2f5ff37
 
 
 
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
2f5ff37
80cecba
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
 
e9738aa
2f5ff37
e9738aa
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
 
 
e9738aa
2f5ff37
e9738aa
2f5ff37
 
 
 
35ef54e
2f5ff37
e9738aa
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
2f5ff37
e9738aa
 
 
2f5ff37
 
e9738aa
 
2f5ff37
 
 
 
e9738aa
 
 
 
 
 
 
 
 
2f5ff37
 
 
e9738aa
2f5ff37
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
 
e9738aa
 
 
2f5ff37
 
e9738aa
2f5ff37
 
e9738aa
 
80cecba
e9738aa
 
 
80cecba
e9738aa
80cecba
 
e9738aa
 
 
 
 
2f5ff37
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
80cecba
e9738aa
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
80cecba
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
 
e9738aa
 
2f5ff37
 
e9738aa
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
80cecba
e9738aa
 
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80cecba
e9738aa
 
 
 
 
80cecba
2f5ff37
e9738aa
 
 
 
 
80cecba
e9738aa
 
 
 
 
 
80cecba
e9738aa
 
80cecba
e9738aa
 
 
 
 
 
 
2f5ff37

# 📚 Install dependencies
# Make sure to run this in your environment if you haven't already
# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet

# ⚙️ Imports
import openai
import anthropic
import google.generativeai as genai
import gradio as gr
from gliner import GLiNER
import traceback 
from collections import defaultdict, Counter
import numpy as np # For calculating average score
import os

# 🧠 Supported models and their providers
# Keys are the display names offered in the "Choose Creative AI Model" dropdown.
# Values are short provider ids: generate_from_prompt() branches on them and
# also uses them to build the "<provider_id>_key" lookup into its key dict.
MODEL_OPTIONS = {
    "OpenAI (GPT-4o)": "openai",
    "Anthropic (Claude 3 Opus)": "anthropic",
    "Google (Gemini 1.5 Pro)": "google"
}

# 🔧 GLiNER Model Configuration
GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"

# --- Load the model only once at startup ---
# On failure the app still starts: gliner_model is left as None and the
# analysis handler checks for that before trying to extract entities.
try:
    print("Loading AI Detective (GLiNER model)... This may take a moment.")
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
    print("AI Detective loaded successfully.")
except Exception as e:
    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {e}")
    # str(e) alone rarely says where the load failed; keep the full stack
    # trace in the logs so the failure can actually be diagnosed.
    traceback.print_exc()
    gliner_model = None

# 🧠 Prompt for the Creative AI to generate label ideas
# The single placeholder {topic} is filled in with str.format() by
# handle_generate(). The reply is then parsed line by line there: a line
# starting with "###" opens a category and a following line starting with "-"
# supplies its comma-separated examples, so the "Output Format" section below
# must stay in step with that parser.
HIERARCHICAL_PROMPT_TEMPLATE = """
You are a helpful research assistant. For the historical topic: **"{topic}"**, your job is to suggest a research framework.

**Instructions:**
1.  First, think of 4-6 **Conceptual Categories** that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands'). These will become the labels.
2.  For each category, list specific **Examples** someone could search for in a text.
3.  **Crucial Rule for Labels:** Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).

**Output Format:**
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its examples.

### Example Category 1
- Example A, Example B, Example C
### Example Category 2
- Example D, Example E
"""

# 🧠 Generator Function (The "Creative Brain")
def generate_from_prompt(prompt, provider, key_dict):
    """Send ``prompt`` to the selected generative model and return its reply text.

    Args:
        prompt: Full prompt text, sent as a single user message.
        provider: Display name of the model; must be a key of ``MODEL_OPTIONS``.
        key_dict: Mapping from ``"<provider_id>_key"`` to the API key string.

    Returns:
        The model's reply with surrounding whitespace removed.

    Raises:
        ValueError: If ``provider`` is unknown, its API key is missing or
            empty, or no client branch exists for its provider id.
    """
    provider_id = MODEL_OPTIONS.get(provider)
    if provider_id is None:
        # Without this check an unknown provider would fall through to the
        # key lookup and be reported, misleadingly, as a missing API key.
        raise ValueError(f"Unknown provider: {provider!r}.")

    api_key = key_dict.get(f"{provider_id}_key")
    if not api_key:
        raise ValueError(f"API key for {provider} not found.")

    if provider_id == "openai":
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        # message.content may be None (e.g. a refusal); treat it as empty text.
        return (response.choices[0].message.content or "").strip()

    if provider_id == "anthropic":
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text.strip()

    if provider_id == "google":
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        return response.text.strip()

    # A provider id listed in MODEL_OPTIONS but not handled above is a
    # configuration mistake; fail loudly instead of returning an empty reply.
    raise ValueError(f"No client is implemented for provider id {provider_id!r}.")

# --- UI Definitions ---

# A list of standard, common labels the user can always choose from.
# Used both as the choices and as the initial selection of the
# "Standard Entity Labels" checkbox group, and by its "Select All" button.
STANDARD_LABELS = [
    "PERSON", "ORGANIZATION", "LOCATION", "COUNTRY", "CITY", "STATE", 
    "NATIONALITY", "GROUP", "DATE", "EVENT", "LAW", "LEGAL_DOCUMENT", 
    "PRODUCT", "FACILITY", "WORK_OF_ART", "LANGUAGE", "TIME", "PERCENTAGE", 
    "MONEY", "CURRENCY", "QUANTITY", "ORDINAL_NUMBER", "CARDINAL_NUMBER"
]

# The maximum number of AI-suggested categories to show. This many hidden
# accordion slots are created up front; categories beyond it are not displayed.
MAX_CATEGORIES = 8

# Layout of the whole app. Components are created in display order; the event
# handlers that use them are defined and wired further down in this same block.
with gr.Blocks(title="Smart Text Analyzer", css=".prose { word-break: break-word; }") as demo:
    gr.Markdown("# Smart Text Analyzer")
    gr.Markdown(
        """
        Welcome! Paste your text below to automatically find and highlight key information. It's like having two smart assistants read your document for you.

        ### How It Works: Two Brains are Better Than One!
        We use two different types of AI to give you the best results.

        🧠 **1. The Creative Brain (Generative AI - like GPT)**
        This AI is a brainstormer. It reads your topic to understand the context, then *imagines* and *suggests* useful labels that fit your document. It helps you discover what to look for!

        🕵️ **2. The Detective (Extractive AI - GLiNER)**
        This AI is a precise detective. Once you give it a list of labels, it meticulously scans the text and *pulls out* (extracts) the exact words that match. It's fantastic at finding specific information with high accuracy.
        """
    )
    
    # Step 1 inputs: topic, model choice, and one masked key field per provider.
    gr.Markdown("--- \n## Step 1: Get Label Ideas from the Creative AI")
    with gr.Row():
        topic = gr.Textbox(label="Enter a Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
        provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Creative AI Model")
    with gr.Row():
        openai_key = gr.Textbox(label="OpenAI API Key", type="password")
        anthropic_key = gr.Textbox(label="Anthropic API Key", type="password")
        google_key = gr.Textbox(label="Google API Key", type="password")
    
    generate_btn = gr.Button("Generate Label Suggestions", variant="primary")

    gr.Markdown("--- \n## Step 2: Build Your Search & Analyze Text")
    gr.Markdown(
        """
        ### What are Entities or Labels?
        Think of them as special highlighters! They find and color-code specific types of information in your text, like `PERSON`, `DATE`, `LOCATION`, or custom things you define.
        """
    )
    
    gr.Markdown("#### 1. Review AI-Suggested Labels")
    gr.Markdown("The AI's suggestions appear below. Uncheck any you don't want.")
    
    # Pre-create MAX_CATEGORIES hidden slots; each slot is an
    # (accordion, checkbox group, "Deselect All" button) triple that
    # handle_generate() later fills in and makes visible.
    dynamic_components = []
    with gr.Column():
        for i in range(MAX_CATEGORIES):
            with gr.Accordion(f"Suggested Label Category {i+1}", visible=False) as acc:
                with gr.Row():
                    # The CheckboxGroup holds the actual labels (e.g., "Protest", "Petition")
                    cg = gr.CheckboxGroup(label="Labels in this category", interactive=True, container=False, scale=4)
                    deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
                dynamic_components.append((acc, cg, deselect_btn))
    
    # Standard labels start fully selected.
    gr.Markdown("#### 2. Include Standard Labels (Optional)")
    with gr.Group():
        standard_labels_checkbox = gr.CheckboxGroup(choices=STANDARD_LABELS, value=STANDARD_LABELS, label="Standard Entity Labels", info="Common categories like people, places, and dates.")
        with gr.Row():
            select_all_std_btn = gr.Button("Select All", size="sm")
            deselect_all_std_btn = gr.Button("Deselect All", size="sm")


    gr.Markdown("#### 3. Add Your Own Custom Labels (Optional)")
    with gr.Group():
        custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
    
    gr.Markdown("--- \n## Step 3: Analyze Your Document")
    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls how strict the AI Detective is. Lower to find more matches. Higher for fewer, more precise matches.")
    text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
    analyze_btn = gr.Button("Analyze Text & Find Entities", variant="primary")
    
    # Hidden by default; the analysis handler shows it while work is in progress.
    analysis_status = gr.Markdown(visible=False) # For the "Analyzing..." message
    
    gr.Markdown("--- \n## Step 4: Review Your Results")
    gr.Markdown(
        """
        ✨ **Pro Tip: Create Your Own Labels!**
        Did our AI miss something? In the **"Highlighted Text"** view below, simply **click and drag to highlight any piece of text**. A small box will appear, allowing you to name and add your own custom label!
        """
    )
    
    # Three views of the same analysis: highlighted text, per-label tables, and a log.
    with gr.Tabs():
        with gr.TabItem("Highlighted Text"):
            highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
        with gr.TabItem("Detailed Results"):
            detailed_results_output = gr.Markdown(label="List of Found Entities by Label")
        with gr.TabItem("Debug Info"):
            debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)
    
    # --- Backend Functions ---

    def _parse_framework(raw_framework):
        """Parse the model's Markdown reply into ``{category: [example, ...]}``.

        A line starting with ``###`` opens a category. A following bullet line
        (``-`` or ``* ``) contributes its comma-separated examples to that
        category. Bullet lines seen before any category, and bullets without
        examples, are ignored.
        """
        framework = defaultdict(list)
        current_category = None
        for raw_line in (raw_framework or "").splitlines():
            line = raw_line.strip()
            if line.startswith("###"):
                current_category = line.strip("#").strip()
            elif current_category and line.startswith(("-", "* ")):
                # Remove only the leading bullet marker. Hyphens inside a label
                # (e.g. "Anti-Corn Law League") are part of the label.
                examples = [e.strip() for e in line.lstrip("-*").split(",") if e.strip()]
                if examples:
                    framework[current_category].extend(examples)
        return framework

    def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
        """Ask the chosen generative model for label ideas and show them in the UI.

        Generator event handler. It first yields an update that disables the
        generate button, then yields the updates for every suggestion slot
        (accordion, checkbox group, deselect button) plus the re-enabled button.

        Args:
            topic: Topic text entered by the user.
            provider: Selected display name from ``MODEL_OPTIONS``.
            openai_k, anthropic_k, google_k: API keys typed into the UI. A
                non-empty ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` /
                ``GOOGLE_API_KEY`` environment variable takes precedence.

        Raises:
            gr.Error: On missing input, a failed model call, or a reply that
                contains no usable categories. The button is re-enabled first.
        """
        yield {
            generate_btn: gr.update(value="🧠 Generating suggestions...", interactive=False)
        }

        try:
            # `or` (rather than a .get() default) so that an environment
            # variable set to an empty string does not hide the typed key.
            key_dict = {
                "openai_key": (os.environ.get("OPENAI_API_KEY") or openai_k or "").strip(),
                "anthropic_key": (os.environ.get("ANTHROPIC_API_KEY") or anthropic_k or "").strip(),
                "google_key": (os.environ.get("GOOGLE_API_KEY") or google_k or "").strip(),
            }

            topic = (topic or "").strip()
            provider_id = MODEL_OPTIONS.get(provider)
            if not topic or not provider or not key_dict.get(f"{provider_id}_key"):
                raise gr.Error("Topic, Provider, and the correct API Key are required.")

            prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
            raw_framework = generate_from_prompt(prompt, provider, key_dict)

            framework = _parse_framework(raw_framework)
            if not framework:
                raise gr.Error("AI failed to generate categories. Please try again or rephrase your topic.")

            updates = {}
            categories = list(framework.items())
            for i in range(MAX_CATEGORIES):
                accordion_comp, checkbox_comp, button_comp = dynamic_components[i]
                if i < len(categories):
                    category_name, entities = categories[i]
                    # The labels are the entities themselves, grouped by the category name
                    sorted_entities = sorted(set(entities))
                    updates[accordion_comp] = gr.update(label=f"Category: {category_name}", visible=True)
                    updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, label="Suggested Labels", visible=True)
                    updates[button_comp] = gr.update(visible=True)
                else:
                    updates[accordion_comp] = gr.update(visible=False)
                    # Clear the slot as well as hiding it: every checkbox group
                    # is an input of the analysis, so a hidden slot that kept
                    # its old selection would leak labels from a previous topic.
                    updates[checkbox_comp] = gr.update(choices=[], value=[], visible=False)
                    updates[button_comp] = gr.update(visible=False)

            updates[generate_btn] = gr.update(value="Generate Label Suggestions", interactive=True)
            yield updates
        except Exception as e:
            # Always hand the button back to the user before surfacing the error.
            yield {generate_btn: gr.update(value="Generate Label Suggestions", interactive=True)}
            raise gr.Error(str(e)) from e

    def _merge_chunk_entities(entities):
        """Collapse duplicate and overlapping spans produced by overlapping chunks.

        Text inside a chunk overlap is scanned twice, so one mention can be
        reported twice (possibly with different scores), and a mention cut by a
        chunk edge can appear as a truncated span beside the complete one.
        Exact duplicates keep their best score; where two spans overlap, the
        higher-scoring one is kept.

        Assumes the model itself reports non-overlapping spans within a chunk,
        so any overlap seen here comes from chunking.

        Returns:
            The surviving entity dicts, ordered by start offset.
        """
        best_by_span = {}
        for ent in entities:
            span = (ent["start"], ent["end"], ent["label"])
            kept = best_by_span.get(span)
            if kept is None or ent["score"] > kept["score"]:
                best_by_span[span] = ent

        merged = []
        for ent in sorted(best_by_span.values(), key=lambda e: (e["start"], e["end"])):
            if merged and ent["start"] < merged[-1]["end"]:
                # Overlaps the previously kept span: keep whichever scored higher.
                if ent["score"] > merged[-1]["score"]:
                    merged[-1] = ent
                continue
            merged.append(ent)
        return merged

    def analyze_text_and_find_entities(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
        """Run GLiNER over ``text`` for every selected label and render the results.

        Generator event handler. It yields a progress update first and the
        final results second; every exit path re-enables the analyze button.

        Args:
            text: Document to analyze.
            standard_labels: Labels ticked in the standard checkbox group.
            custom_label_text: Comma-separated custom labels typed by the user.
            threshold: Minimum confidence passed to the model.
            *suggested_labels_from_groups: Selected values of each AI-suggested
                checkbox group, one positional argument per group.

        Yields:
            Dicts mapping output components to their new values.

        Raises:
            gr.Error: If the model did not load at startup or extraction fails.
        """
        # Check before touching the UI: raising after the button has been
        # disabled would leave it stuck on "Analyzing...".
        if gliner_model is None:
            raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check logs.")

        text = text or ""

        # --- 1. Show Progress to User ---
        yield {
            analyze_btn: gr.update(value="🕵️ Analyzing...", interactive=False),
            analysis_status: gr.update(value="Our AI Detective is scanning your text. This may take a moment...", visible=True),
            highlighted_text_output: None,
            detailed_results_output: None,
            debug_output: "Starting analysis..."
        }

        debug_info = []

        # --- 2. Collect All Labels from UI ---
        labels_to_use = set()
        # Labels from the dynamically generated suggestion groups
        for group in suggested_labels_from_groups:
            if group:
                labels_to_use.update(group)
        # Labels from the standard list
        if standard_labels:
            labels_to_use.update(standard_labels)
        # Labels from the custom textbox (tolerate a missing value)
        labels_to_use.update(
            part.strip() for part in (custom_label_text or "").split(',') if part.strip()
        )

        final_labels = sorted(labels_to_use)
        debug_info.append(f"🧠 Searching for {len(final_labels)} unique labels.")
        debug_info.append(f"⚙️ Confidence Threshold: {threshold}")

        if not text or not final_labels:
            yield {
                analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
                analysis_status: gr.update(visible=False),
                highlighted_text_output: {"text": text, "entities": []},
                detailed_results_output: "Please provide text and select at least one label to search for.",
                debug_output: "Analysis stopped: No text or no labels provided."
            }
            return

        # --- 3. Run the GLiNER Model (The "Detective") ---
        # Process text in overlapping character windows to handle long documents.
        chunk_size, overlap = 1024, 100
        all_entities = []
        try:
            for offset in range(0, len(text), chunk_size - overlap):
                chunk = text[offset : offset + chunk_size]
                for ent in gliner_model.predict_entities(chunk, final_labels, threshold=threshold):
                    # Shift chunk-relative offsets to document offsets.
                    all_entities.append({**ent, "start": ent["start"] + offset, "end": ent["end"] + offset})
                if offset + chunk_size >= len(text):
                    # This window reached the end of the text; any later window
                    # would only re-scan characters already covered.
                    break
        except Exception as e:
            yield {
                analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
                analysis_status: gr.update(visible=False),
                debug_output: "\n".join(debug_info + [f"❌ Extraction failed: {e}"])
            }
            raise gr.Error(f"Entity extraction failed: {e}") from e

        # Deduplicate entities found twice (or cut in half) in chunk overlaps
        unique_entities = _merge_chunk_entities(all_entities)
        debug_info.append(f"📊 Found {len(unique_entities)} raw entity mentions.")

        # --- 4. Prepare Highlighted Text Output ---
        # HighlightedText reads the category from the "entity" key of each dict.
        highlighted_output_data = {
            "text": text,
            "entities": [
                {"entity": ent["label"], "start": ent["start"], "end": ent["end"]}
                for ent in unique_entities
            ]
        }

        # --- 5. Prepare Detailed Table-Based Results ---
        aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
        for ent in unique_entities:
            match_text = text[ent['start']:ent['end']]
            # Use a key of (label, lowercase_text) to group similar items
            bucket = aggregated_matches[(ent['label'], match_text.lower())]
            bucket['count'] += 1
            bucket['scores'].append(ent['score'])
            # Store the first-seen casing of the text
            if not bucket['original_casing']:
                bucket['original_casing'] = match_text

        # Group aggregated results by label for final display
        results_by_label = defaultdict(list)
        for (label, _), data in aggregated_matches.items():
            results_by_label[label].append({
                'text': data['original_casing'],
                'count': data['count'],
                'avg_score': np.mean(data['scores'])
            })

        # --- 6. Build the Markdown String for the Detailed Table ---
        parts = []
        for label, items in sorted(results_by_label.items()):
            parts.append(f"### {label}\n")
            parts.append("| Text Found | Instances Found | Avg. Confidence Score* |\n")
            parts.append("|------------|-----------------|--------------------------|\n")
            # Sort items by count (most frequent first)
            for item in sorted(items, key=lambda x: x['count'], reverse=True):
                # A raw "|" or line break inside the match would break the table row.
                cell = item['text'].replace("|", "\\|").replace("\r", " ").replace("\n", " ")
                parts.append(f"| {cell} | {item['count']} | {item['avg_score']:.2f} |\n")
            parts.append("\n")
        markdown_string = "".join(parts)

        if not markdown_string:
            markdown_string = "No entities found. Try lowering the confidence threshold or changing your labels."
        else:
            markdown_string += "\n---\n<small><i>*<b>Confidence Score:</b> How sure the AI Detective (GLiNER) is that it found the correct label (1.00 = 100% certain). The score shown is the average across all instances of that text.</i></small>"

        debug_info.append("✅ Analysis complete.")

        # --- 7. Yield Final Results to UI ---
        yield {
            analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
            analysis_status: gr.update(visible=False),
            highlighted_text_output: highlighted_output_data,
            detailed_results_output: markdown_string,
            debug_output: "\n".join(debug_info)
        }

    # --- Wire up UI events ---
    # handle_generate updates the button plus every part of every suggestion
    # slot, so flatten the (accordion, checkbox group, button) triples in order.
    slot_outputs = []
    for slot in dynamic_components:
        slot_outputs.extend(slot)

    generate_btn.click(
        fn=handle_generate,
        inputs=[topic, provider, openai_key, anthropic_key, google_key],
        outputs=[generate_btn, *slot_outputs]
    )

    # Functions for Select/Deselect All buttons
    def deselect_all():
        """Return an update that clears a checkbox group's selection."""
        return gr.update(value=[])

    def select_all(choices):
        """Return an update that sets a checkbox group's selection to ``choices``."""
        return gr.update(value=choices)

    deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
    select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])

    # Each suggestion slot's "Deselect All" button clears only its own group.
    for _, slot_checkboxes, slot_button in dynamic_components:
        slot_button.click(fn=deselect_all, inputs=None, outputs=[slot_checkboxes])

    # The analysis takes the four fixed inputs followed by one value per
    # suggestion checkbox group (received as *suggested_labels_from_groups).
    suggestion_groups = [slot[1] for slot in dynamic_components]
    analyze_btn.click(
        fn=analyze_text_and_find_entities,
        inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider, *suggestion_groups],
        outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output]
    )

# Start the app. This runs at import time (there is no __main__ guard).
# NOTE(review): share=True presumably requests a public share link, and the UI
# accepts provider API keys — confirm that exposure is intended.
demo.launch(share=True, debug=True)