File size: 18,109 Bytes
2f5ff37
 
e9738aa
2f5ff37
 
 
 
 
 
 
e9738aa
31914d5
e9738aa
80cecba
2f5ff37
 
 
 
 
 
 
 
 
 
 
 
 
e9738aa
2f5ff37
e9738aa
2f5ff37
 
 
 
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
2f5ff37
80cecba
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
 
e9738aa
2f5ff37
e9738aa
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5ff37
 
 
e9738aa
2f5ff37
e9738aa
2f5ff37
 
 
 
35ef54e
2f5ff37
e9738aa
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
2f5ff37
e9738aa
 
 
2f5ff37
 
e9738aa
 
2f5ff37
 
 
 
e9738aa
 
 
 
 
 
 
 
 
2f5ff37
 
 
e9738aa
2f5ff37
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
 
e9738aa
 
 
2f5ff37
 
e9738aa
2f5ff37
 
e9738aa
 
80cecba
e9738aa
 
 
80cecba
e9738aa
80cecba
 
e9738aa
 
 
 
 
2f5ff37
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
80cecba
e9738aa
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
80cecba
e9738aa
 
2f5ff37
e9738aa
 
 
2f5ff37
 
 
e9738aa
 
2f5ff37
 
e9738aa
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
80cecba
e9738aa
 
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
2f5ff37
e9738aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80cecba
e9738aa
 
 
 
 
80cecba
2f5ff37
e9738aa
 
 
 
 
80cecba
e9738aa
 
 
 
 
 
80cecba
e9738aa
 
80cecba
e9738aa
 
 
 
 
 
 
2f5ff37
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# 📚 Install dependencies
# Make sure to run this in your environment if you haven't already
# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet

# βš™οΈ Imports
import openai
import anthropic
import google.generativeai as genai
import gradio as gr
from gliner import GLiNER
import traceback 
from collections import defaultdict, Counter
import numpy as np # For calculating average score
import os

# 🧠 Supported models: dropdown display name -> provider id.
# The provider id selects the API client in generate_from_prompt and is the
# prefix of the matching "<id>_key" entry in the key dictionary.
MODEL_OPTIONS = dict(
    [
        ("OpenAI (GPT-4o)", "openai"),
        ("Anthropic (Claude 3 Opus)", "anthropic"),
        ("Google (Gemini 1.5 Pro)", "google"),
    ]
)

# 🔧 GLiNER model configuration
GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"

# --- One-time model load at import time ---
# On failure the app still starts; `gliner_model` is left as None and the
# analysis handler checks for that before trying to extract entities.
try:
    print("Loading AI Detective (GLiNER model)... This may take a moment.")
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
    print("AI Detective loaded successfully.")
except Exception as load_error:
    gliner_model = None
    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {load_error}")

# 🧠 Prompt for the Creative AI to generate label ideas.
# The only placeholder is {topic}; handle_generate fills it with str.format().
# The requested output shape (a "###" header per category followed by "-"
# bullet lines of comma-separated examples) is what handle_generate parses,
# so keep the two in sync when editing this text.
HIERARCHICAL_PROMPT_TEMPLATE = """
You are a helpful research assistant. For the historical topic: **"{topic}"**, your job is to suggest a research framework.

**Instructions:**
1.  First, think of 4-6 **Conceptual Categories** that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands'). These will become the labels.
2.  For each category, list specific **Examples** someone could search for in a text.
3.  **Crucial Rule for Labels:** Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).

**Output Format:**
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its examples.

### Example Category 1
- Example A, Example B, Example C
### Example Category 2
- Example D, Example E
"""

# 🧠 Generator Function (The "Creative Brain")
def generate_from_prompt(prompt, provider, key_dict):
    """Send ``prompt`` to the selected generative model and return its text reply.

    Args:
        prompt: The full prompt text to send as a single user message.
        provider: A display name from ``MODEL_OPTIONS`` (e.g. "OpenAI (GPT-4o)").
        key_dict: Mapping holding the API keys under "openai_key",
            "anthropic_key" and "google_key".

    Returns:
        The model's reply with surrounding whitespace removed. An empty
        string is returned when the provider replies with no text.

    Raises:
        ValueError: If ``provider`` is not a supported option, or no API key
            is available for it.
    """
    provider_id = MODEL_OPTIONS.get(provider)
    if provider_id is None:
        # Fail with an accurate message instead of a misleading
        # "API key not found" for a provider that does not exist.
        raise ValueError(f"Unsupported provider: {provider!r}.")

    api_key = key_dict.get(f"{provider_id}_key")
    if not api_key:
        raise ValueError(f"API key for {provider} not found.")

    if provider_id == "openai":
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        reply = response.choices[0].message.content
    elif provider_id == "anthropic":
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = response.content[0].text
    elif provider_id == "google":
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        reply = response.text
    else:
        # A provider id listed in MODEL_OPTIONS without a client branch here
        # is a configuration error, not an empty answer.
        raise ValueError(f"No client is implemented for provider id {provider_id!r}.")

    # The OpenAI message content may be None (e.g. a refusal); treat it as empty.
    return (reply or "").strip()

# --- UI Definitions ---

# Built-in entity labels that are always offered, independent of the AI's
# topic-specific suggestions.
STANDARD_LABELS = [
    "PERSON",
    "ORGANIZATION",
    "LOCATION",
    "COUNTRY",
    "CITY",
    "STATE",
    "NATIONALITY",
    "GROUP",
    "DATE",
    "EVENT",
    "LAW",
    "LEGAL_DOCUMENT",
    "PRODUCT",
    "FACILITY",
    "WORK_OF_ART",
    "LANGUAGE",
    "TIME",
    "PERCENTAGE",
    "MONEY",
    "CURRENCY",
    "QUANTITY",
    "ORDINAL_NUMBER",
    "CARDINAL_NUMBER",
]

# Upper bound on how many AI-suggested categories the UI can display
MAX_CATEGORIES = 8

# Everything below (until the launch call) is declared inside this Blocks
# context: first the page layout, then the handlers and their event wiring.
with gr.Blocks(title="Smart Text Analyzer", css=".prose { word-break: break-word; }") as demo:
    gr.Markdown("# Smart Text Analyzer")
    # Intro text; the emoji were previously stored as mis-decoded bytes and
    # rendered as garbage in the page.
    gr.Markdown(
        """
        Welcome! Paste your text below to automatically find and highlight key information. It's like having two smart assistants read your document for you.

        ### How It Works: Two Brains are Better Than One!
        We use two different types of AI to give you the best results.

        🧠 **1. The Creative Brain (Generative AI - like GPT)**
        This AI is a brainstormer. It reads your topic to understand the context, then *imagines* and *suggests* useful labels that fit your document. It helps you discover what to look for!

        🕵️ **2. The Detective (Extractive AI - GLiNER)**
        This AI is a precise detective. Once you give it a list of labels, it meticulously scans the text and *pulls out* (extracts) the exact words that match. It's fantastic at finding specific information with high accuracy.
        """
    )
    
    # ---- Step 1: topic, model choice and API keys for label generation ----
    gr.Markdown("--- \n## Step 1: Get Label Ideas from the Creative AI")
    with gr.Row():
        topic = gr.Textbox(label="Enter a Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
        # Choices are the display names; handlers map them to provider ids via MODEL_OPTIONS
        provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Creative AI Model")
    with gr.Row():
        # One key field per provider; all three are passed to handle_generate
        openai_key = gr.Textbox(label="OpenAI API Key", type="password")
        anthropic_key = gr.Textbox(label="Anthropic API Key", type="password")
        google_key = gr.Textbox(label="Google API Key", type="password")
    
    generate_btn = gr.Button("Generate Label Suggestions", variant="primary")

    # ---- Step 2: choose which labels the extractor should search for ----
    gr.Markdown("--- \n## Step 2: Build Your Search & Analyze Text")
    gr.Markdown(
        """
        ### What are Entities or Labels?
        Think of them as special highlighters! They find and color-code specific types of information in your text, like `PERSON`, `DATE`, `LOCATION`, or custom things you define.
        """
    )
    
    gr.Markdown("#### 1. Review AI-Suggested Labels")
    gr.Markdown("The AI's suggestions appear below. Uncheck any you don't want.")
    
    # A fixed pool of MAX_CATEGORIES hidden (accordion, checkbox group, button)
    # triples is created up front; handle_generate later fills and reveals as
    # many of them as the model returned categories.
    dynamic_components = []
    with gr.Column():
        for i in range(MAX_CATEGORIES):
            with gr.Accordion(f"Suggested Label Category {i+1}", visible=False) as acc:
                with gr.Row():
                    # The CheckboxGroup holds the actual labels (e.g., "Protest", "Petition")
                    cg = gr.CheckboxGroup(label="Labels in this category", interactive=True, container=False, scale=4)
                    deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
                dynamic_components.append((acc, cg, deselect_btn))
    
    gr.Markdown("#### 2. Include Standard Labels (Optional)")
    with gr.Group():
        # All standard labels start out selected
        standard_labels_checkbox = gr.CheckboxGroup(choices=STANDARD_LABELS, value=STANDARD_LABELS, label="Standard Entity Labels", info="Common categories like people, places, and dates.")
        with gr.Row():
            select_all_std_btn = gr.Button("Select All", size="sm")
            deselect_all_std_btn = gr.Button("Deselect All", size="sm")


    gr.Markdown("#### 3. Add Your Own Custom Labels (Optional)")
    with gr.Group():
        # Free-text labels; the analysis handler splits this value on commas
        custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
    
    # ---- Step 3: the document to analyze and the extraction threshold ----
    gr.Markdown("--- \n## Step 3: Analyze Your Document")
    # The slider value is passed straight through as GLiNER's `threshold` argument
    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls how strict the AI Detective is. Lower to find more matches. Higher for fewer, more precise matches.")
    text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
    analyze_btn = gr.Button("Analyze Text & Find Entities", variant="primary")
    
    analysis_status = gr.Markdown(visible=False) # For the "Analyzing..." message
    
    # ---- Step 4: result views, all filled in by the analysis handler ----
    gr.Markdown("--- \n## Step 4: Review Your Results")
    # The sparkle emoji was previously stored as mis-decoded bytes.
    gr.Markdown(
        """
        ✨ **Pro Tip: Create Your Own Labels!**
        Did our AI miss something? In the **"Highlighted Text"** view below, simply **click and drag to highlight any piece of text**. A small box will appear, allowing you to name and add your own custom label!
        """
    )
    
    with gr.Tabs():
        with gr.TabItem("Highlighted Text"):
            highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
        with gr.TabItem("Detailed Results"):
            detailed_results_output = gr.Markdown(label="List of Found Entities by Label")
        with gr.TabItem("Debug Info"):
            debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)
    
    # --- Backend Functions ---

    def _parse_label_framework(raw_framework):
        """Parse the model's Markdown reply into ``{category: [labels]}``.

        A line starting with ``###`` opens a category. Each later line that
        starts with ``-`` adds its comma-separated items to that category.
        Bullet lines before the first header, and all other lines, are ignored.
        """
        framework = defaultdict(list)
        current_category = None
        for raw_line in raw_framework.splitlines():
            line = raw_line.strip()
            if line.startswith("###"):
                current_category = line.lstrip("#").strip()
            elif line.startswith("-") and current_category:
                # Remove only the leading bullet marker: hyphens inside a
                # label (e.g. "Anti-Corn Law League") must be preserved.
                items = [part.strip() for part in line.lstrip("-").split(",")]
                labels = [item for item in items if item]
                if labels:
                    framework[current_category].extend(labels)
        return framework

    def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
        """Ask the chosen generative model for label ideas and show them in the UI.

        This is a generator handler: it first yields an update that disables
        the generate button, then yields the updates for the suggestion
        accordions (or re-enables the button and raises on failure).

        Args:
            topic: The research topic typed by the user.
            provider: Display name selected in the model dropdown.
            openai_k: OpenAI key typed in the UI (fallback if no env var is set).
            anthropic_k: Anthropic key typed in the UI (same fallback rule).
            google_k: Google key typed in the UI (same fallback rule).

        Yields:
            Dicts mapping Gradio components to their updates.

        Raises:
            gr.Error: If required input is missing, the model call fails, or
                no categories can be parsed from the reply.
        """
        yield {
            generate_btn: gr.update(value="🧠 Generating suggestions...", interactive=False)
        }

        try:
            # Environment variables take priority; `or` (rather than a .get
            # default) lets an empty variable fall back to the typed key.
            key_dict = {
                "openai_key": os.environ.get("OPENAI_API_KEY") or openai_k,
                "anthropic_key": os.environ.get("ANTHROPIC_API_KEY") or anthropic_k,
                "google_key": os.environ.get("GOOGLE_API_KEY") or google_k
            }

            topic = (topic or "").strip()
            provider_id = MODEL_OPTIONS.get(provider)
            if not topic or not provider or not key_dict.get(f"{provider_id}_key"):
                raise gr.Error("Topic, Provider, and the correct API Key are required.")

            prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
            raw_framework = generate_from_prompt(prompt, provider, key_dict)

            framework = _parse_label_framework(raw_framework)
            if not framework:
                raise gr.Error("AI failed to generate categories. Please try again or rephrase your topic.")

            updates = {}
            categories = list(framework.items())
            # Categories beyond the pre-built component pool are dropped.
            for i, (accordion_comp, checkbox_comp, button_comp) in enumerate(dynamic_components):
                if i < len(categories):
                    category_name, entities = categories[i]
                    # The labels are the entities themselves, grouped by the category name
                    sorted_entities = sorted(set(entities))
                    updates[accordion_comp] = gr.update(label=f"Category: {category_name}", visible=True)
                    updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, label="Suggested Labels", visible=True)
                    updates[button_comp] = gr.update(visible=True)
                else:
                    updates[accordion_comp] = gr.update(visible=False)
                    # Clear the value as well: the analysis handler reads every
                    # checkbox group, so a hidden group left over from an
                    # earlier run would otherwise still contribute its labels.
                    updates[checkbox_comp] = gr.update(choices=[], value=[], visible=False)
                    updates[button_comp] = gr.update(visible=False)

            updates[generate_btn] = gr.update(value="Generate Label Suggestions", interactive=True)
            yield updates
        except Exception as exc:
            # Always hand the button back to the user before reporting the error.
            yield {generate_btn: gr.update(value="Generate Label Suggestions", interactive=True)}
            if isinstance(exc, gr.Error):
                # Already user-facing; re-wrapping via str() would mangle the message.
                raise exc
            raise gr.Error(str(exc)) from exc

    def _dedupe_entities(entities):
        """Collapse repeated detections of the same (start, end, label) span.

        Overlapping chunks can report one span twice with different scores, so
        the score must not be part of the identity; the highest score is kept.
        The result is ordered by position in the text.
        """
        best_by_span = {}
        for ent in entities:
            span_key = (ent["start"], ent["end"], ent["label"])
            kept = best_by_span.get(span_key)
            if kept is None or ent["score"] > kept["score"]:
                best_by_span[span_key] = ent
        return sorted(best_by_span.values(), key=lambda e: (e["start"], e["end"], e["label"]))

    def _format_results_markdown(text, entities):
        """Build the per-label Markdown tables (text, count, mean score)."""
        # Group case-insensitively per label, remembering the first-seen casing.
        aggregated = {}
        for ent in entities:
            match_text = text[ent["start"]:ent["end"]]
            bucket = aggregated.setdefault(
                (ent["label"], match_text.lower()),
                {"text": match_text, "scores": []},
            )
            bucket["scores"].append(ent["score"])

        rows_by_label = defaultdict(list)
        for (label, _), bucket in aggregated.items():
            rows_by_label[label].append(bucket)

        if not rows_by_label:
            return "No entities found. Try lowering the confidence threshold or changing your labels."

        lines = []
        for label in sorted(rows_by_label):
            lines.append(f"### {label}")
            lines.append("| Text Found | Instances Found | Avg. Confidence Score* |")
            lines.append("|------------|-----------------|--------------------------|")
            # Most frequent first; ties keep their order of appearance.
            for row in sorted(rows_by_label[label], key=lambda r: len(r["scores"]), reverse=True):
                # A raw "|" or newline inside a cell would break the table.
                cell = row["text"].replace("|", "\\|").replace("\n", " ")
                average = sum(row["scores"]) / len(row["scores"])
                lines.append(f"| {cell} | {len(row['scores'])} | {average:.2f} |")
            lines.append("")
        lines.append("")
        lines.append("---")
        lines.append("<small><i>*<b>Confidence Score:</b> How sure the AI Detective (GLiNER) is that it found the correct label (1.00 = 100% certain). The score shown is the average across all instances of that text.</i></small>")
        return "\n".join(lines)

    def analyze_text_and_find_entities(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
        """Run GLiNER over ``text`` for the selected labels and fill the result views.

        This is a generator handler: it first yields a "busy" state, then a
        final state with the highlighted text, the results tables and a log.

        Args:
            text: The document to analyze.
            standard_labels: Labels ticked in the standard-labels group.
            custom_label_text: Comma-separated custom labels (may be empty).
            threshold: Confidence threshold passed to GLiNER.
            *suggested_labels_from_groups: The ticked labels of each
                AI-suggestion checkbox group.

        Yields:
            Dicts mapping Gradio components to their new values/updates.

        Raises:
            gr.Error: If the GLiNER model is unavailable or extraction fails.
                The button and status message are reset before raising.
        """
        # --- 1. Show Progress to User ---
        yield {
            analyze_btn: gr.update(value="🕵️ Analyzing...", interactive=False),
            analysis_status: gr.update(value="Our AI Detective is scanning your text. This may take a moment...", visible=True),
            highlighted_text_output: None,
            detailed_results_output: None,
            debug_output: "Starting analysis..."
        }

        debug_info = []
        try:
            if gliner_model is None:
                raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check logs.")

            # --- 2. Collect All Labels from UI ---
            labels_to_use = set()
            for group in suggested_labels_from_groups:
                if group:
                    labels_to_use.update(group)
            if standard_labels:
                labels_to_use.update(standard_labels)
            labels_to_use.update(
                label.strip() for label in (custom_label_text or "").split(",") if label.strip()
            )

            final_labels = sorted(labels_to_use)
            debug_info.append(f"🧠 Searching for {len(final_labels)} unique labels.")
            debug_info.append(f"⚙️ Confidence Threshold: {threshold}")

            if not text or not final_labels:
                yield {
                    analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
                    analysis_status: gr.update(visible=False),
                    highlighted_text_output: {"text": text or "", "entities": []},
                    detailed_results_output: "Please provide text and select at least one label to search for.",
                    debug_output: "Analysis stopped: No text or no labels provided."
                }
                return

            # --- 3. Run the GLiNER Model (The "Detective") ---
            # Process the text in overlapping character windows so long
            # documents fit the model; offsets are shifted back to the
            # full-text coordinates.
            all_entities = []
            chunk_size, overlap = 1024, 100
            for offset in range(0, len(text), chunk_size - overlap):
                chunk = text[offset:offset + chunk_size]
                for ent in gliner_model.predict_entities(chunk, final_labels, threshold=threshold):
                    all_entities.append({**ent, "start": ent["start"] + offset, "end": ent["end"] + offset})
                if offset + chunk_size >= len(text):
                    # This window already reached the end of the text; a
                    # further one would only re-scan the overlap.
                    break

            unique_entities = _dedupe_entities(all_entities)
            debug_info.append(f"📊 Found {len(unique_entities)} raw entity mentions.")

            # --- 4. Prepare Highlighted Text Output ---
            highlighted_output_data = {
                "text": text,
                "entities": [{"start": ent["start"], "end": ent["end"], "label": ent["label"]} for ent in unique_entities]
            }

            # --- 5. Prepare Detailed Table-Based Results ---
            markdown_string = _format_results_markdown(text, unique_entities)

            debug_info.append("✅ Analysis complete.")

            # --- 6. Yield Final Results to UI ---
            yield {
                analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
                analysis_status: gr.update(visible=False),
                highlighted_text_output: highlighted_output_data,
                detailed_results_output: markdown_string,
                debug_output: "\n".join(debug_info)
            }
        except Exception as exc:
            # Without this reset the button would stay disabled and the
            # "scanning" message would stay visible after any failure.
            yield {
                analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
                analysis_status: gr.update(visible=False),
                debug_output: "\n".join(debug_info + [f"Analysis failed: {exc}"])
            }
            if isinstance(exc, gr.Error):
                raise exc
            raise gr.Error(str(exc)) from exc

    # --- Connect each button to its handler ---
    # The generate handler updates its own button plus every accordion,
    # checkbox group and deselect button in the suggestion pool.
    generate_btn.click(
        fn=handle_generate,
        inputs=[topic, provider, openai_key, anthropic_key, google_key],
        outputs=[generate_btn] + [component for group in dynamic_components for component in group]
    )

    # Helpers behind the Select All / Deselect All buttons
    def deselect_all():
        """Return an update that clears the target component's selection."""
        nothing_selected = []
        return gr.update(value=nothing_selected)

    def select_all(choices):
        """Return an update that selects every entry in ``choices``."""
        return gr.update(value=choices)

    deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
    select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])

    # Each suggestion group's own button clears only that group
    for _accordion, checkbox_group, clear_btn in dynamic_components:
        clear_btn.click(fn=deselect_all, inputs=None, outputs=[checkbox_group])

    # Fixed inputs first, then one input per suggestion checkbox group; the
    # handler receives the latter through its *args parameter.
    analyze_btn.click(
        fn=analyze_text_and_find_entities,
        inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider]
        + [checkbox_group for _accordion, checkbox_group, _button in dynamic_components],
        outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output]
    )

# Start the Gradio app as soon as this file is executed.
# NOTE(review): per the Gradio launch docs, share=True requests a public share
# link and debug=True keeps the process in the foreground with error output —
# confirm both are intended for the deployed environment.
demo.launch(share=True, debug=True)