Spaces:
Sleeping
Sleeping
File size: 18,109 Bytes
2f5ff37 e9738aa 2f5ff37 e9738aa 31914d5 e9738aa 80cecba 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 80cecba 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 35ef54e 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 80cecba e9738aa 80cecba e9738aa 80cecba e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 80cecba e9738aa 2f5ff37 e9738aa 80cecba e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 80cecba e9738aa 2f5ff37 e9738aa 2f5ff37 e9738aa 80cecba e9738aa 80cecba 2f5ff37 e9738aa 80cecba e9738aa 80cecba e9738aa 80cecba e9738aa 2f5ff37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 |
# Install dependencies
# Make sure to run this in your environment if you haven't already
# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet
# Imports
import openai
import anthropic
import google.generativeai as genai
import gradio as gr
from gliner import GLiNER
import traceback
from collections import defaultdict, Counter
import numpy as np # For calculating average score
import os
# Supported generative models: dropdown label -> provider id.
# The provider id picks the API client in generate_from_prompt and names the
# "<id>_key" entry that holds the matching API key.
MODEL_OPTIONS = {
    "OpenAI (GPT-4o)": "openai",
    "Anthropic (Claude 3 Opus)": "anthropic",
    "Google (Gemini 1.5 Pro)": "google",
}
# GLiNER model configuration
GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"

# Load the extraction model once, at import time, so every request reuses it.
# A failed load is deliberately non-fatal: gliner_model is left as None and
# analyze_text_and_find_entities reports the problem to the user instead of
# the whole app refusing to start.
try:
    print("Loading AI Detective (GLiNER model)... This may take a moment.")
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
except Exception as e:
    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {e}")
    # The one-line message rarely identifies the cause (network, disk, version
    # mismatch...), so keep the full stack trace in the logs as well.
    traceback.print_exc()
    gliner_model = None
else:
    print("AI Detective loaded successfully.")
# Prompt for the Creative AI to generate label ideas.
# "{topic}" is the only placeholder; handle_generate fills it in with
# str.format(), so any literal brace added to this text must be doubled.
# handle_generate then parses the reply line by line: a line starting with
# "###" opens a category and a line starting with "-" lists its
# comma-separated examples. Keep the "Output Format" section below in sync
# with that parser.
HIERARCHICAL_PROMPT_TEMPLATE = """
You are a helpful research assistant. For the historical topic: **"{topic}"**, your job is to suggest a research framework.
**Instructions:**
1. First, think of 4-6 **Conceptual Categories** that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands'). These will become the labels.
2. For each category, list specific **Examples** someone could search for in a text.
3. **Crucial Rule for Labels:** Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).
**Output Format:**
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its examples.
### Example Category 1
- Example A, Example B, Example C
### Example Category 2
- Example D, Example E
"""
# Generator function (the "Creative Brain")
def generate_from_prompt(prompt, provider, key_dict):
    """Send *prompt* to the selected generative model and return its reply.

    Args:
        prompt: Full prompt text; it is sent as a single user message.
        provider: A display name that is a key of MODEL_OPTIONS.
        key_dict: Mapping holding the API keys under "<provider id>_key"
            (e.g. "openai_key").

    Returns:
        The model's text reply with surrounding whitespace removed, or ""
        when the provider id has no client branch below or OpenAI returned
        no text content.

    Raises:
        ValueError: If *provider* is not a key of MODEL_OPTIONS, or if no
            API key is available for it.
    """
    provider_id = MODEL_OPTIONS.get(provider)
    if provider_id is None:
        # Without this check an unknown provider was reported as a missing
        # API key, which pointed the user at the wrong problem.
        raise ValueError(f"Unknown provider: {provider!r}.")

    api_key = key_dict.get(f"{provider_id}_key")
    if not api_key:
        raise ValueError(f"API key for {provider} not found.")

    if provider_id == "openai":
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        # message.content is optional in the API response; treat a missing
        # value as an empty reply instead of failing on None.strip().
        content = response.choices[0].message.content
        return (content or "").strip()

    if provider_id == "anthropic":
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text.strip()

    if provider_id == "google":
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        return response.text.strip()

    # A provider id listed in MODEL_OPTIONS but without a client branch.
    return ""
# --- UI Definitions ---
# Built-in entity labels the user can always pick, independent of any
# AI-generated suggestions. The order here is the order shown in the UI.
STANDARD_LABELS = [
    "PERSON",
    "ORGANIZATION",
    "LOCATION",
    "COUNTRY",
    "CITY",
    "STATE",
    "NATIONALITY",
    "GROUP",
    "DATE",
    "EVENT",
    "LAW",
    "LEGAL_DOCUMENT",
    "PRODUCT",
    "FACILITY",
    "WORK_OF_ART",
    "LANGUAGE",
    "TIME",
    "PERCENTAGE",
    "MONEY",
    "CURRENCY",
    "QUANTITY",
    "ORDINAL_NUMBER",
    "CARDINAL_NUMBER",
]

# Upper bound on how many AI-suggested categories the UI can display.
MAX_CATEGORIES = 8
# Page layout. Every component created below is bound to a module-level name
# so the callbacks and the event wiring further down can refer to it.
# NOTE(review): this copy of the file has lost its indentation; the statements
# that follow are presumably the body of this `with` block - confirm nesting.
with gr.Blocks(title="Smart Text Analyzer", css=".prose { word-break: break-word; }") as demo:
gr.Markdown("# Smart Text Analyzer")
gr.Markdown(
"""
Welcome! Paste your text below to automatically find and highlight key information. It's like having two smart assistants read your document for you.
### How It Works: Two Brains are Better Than One!
We use two different types of AI to give you the best results.
π§ **1. The Creative Brain (Generative AI - like GPT)**
This AI is a brainstormer. It reads your topic to understand the context, then *imagines* and *suggests* useful labels that fit your document. It helps you discover what to look for!
π΅οΈ **2. The Detective (Extractive AI - GLiNER)**
This AI is a precise detective. Once you give it a list of labels, it meticulously scans the text and *pulls out* (extracts) the exact words that match. It's fantastic at finding specific information with high accuracy.
"""
)
# Step 1 inputs: topic, model choice and one API key field per provider.
gr.Markdown("--- \n## Step 1: Get Label Ideas from the Creative AI")
with gr.Row():
topic = gr.Textbox(label="Enter a Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Creative AI Model")
with gr.Row():
openai_key = gr.Textbox(label="OpenAI API Key", type="password")
anthropic_key = gr.Textbox(label="Anthropic API Key", type="password")
google_key = gr.Textbox(label="Google API Key", type="password")
generate_btn = gr.Button("Generate Label Suggestions", variant="primary")
gr.Markdown("--- \n## Step 2: Build Your Search & Analyze Text")
gr.Markdown(
"""
### What are Entities or Labels?
Think of them as special highlighters! They find and color-code specific types of information in your text, like `PERSON`, `DATE`, `LOCATION`, or custom things you define.
"""
)
gr.Markdown("#### 1. Review AI-Suggested Labels")
gr.Markdown("The AI's suggestions appear below. Uncheck any you don't want.")
# A fixed pool of MAX_CATEGORIES hidden accordions is created up front;
# handle_generate later reveals and fills as many as the AI returned.
# Each entry is an (accordion, checkbox group, deselect button) triple.
dynamic_components = []
with gr.Column():
for i in range(MAX_CATEGORIES):
with gr.Accordion(f"Suggested Label Category {i+1}", visible=False) as acc:
with gr.Row():
# The CheckboxGroup holds the actual labels (e.g., "Protest", "Petition")
cg = gr.CheckboxGroup(label="Labels in this category", interactive=True, container=False, scale=4)
deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
dynamic_components.append((acc, cg, deselect_btn))
gr.Markdown("#### 2. Include Standard Labels (Optional)")
with gr.Group():
# All standard labels start out selected.
standard_labels_checkbox = gr.CheckboxGroup(choices=STANDARD_LABELS, value=STANDARD_LABELS, label="Standard Entity Labels", info="Common categories like people, places, and dates.")
with gr.Row():
select_all_std_btn = gr.Button("Select All", size="sm")
deselect_all_std_btn = gr.Button("Deselect All", size="sm")
gr.Markdown("#### 3. Add Your Own Custom Labels (Optional)")
with gr.Group():
custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")
gr.Markdown("--- \n## Step 3: Analyze Your Document")
# The slider value is passed to GLiNER as the prediction threshold.
threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls how strict the AI Detective is. Lower to find more matches. Higher for fewer, more precise matches.")
text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
analyze_btn = gr.Button("Analyze Text & Find Entities", variant="primary")
analysis_status = gr.Markdown(visible=False) # For the "Analyzing..." message
gr.Markdown("--- \n## Step 4: Review Your Results")
gr.Markdown(
"""
β¨ **Pro Tip: Create Your Own Labels!**
Did our AI miss something? In the **"Highlighted Text"** view below, simply **click and drag to highlight any piece of text**. A small box will appear, allowing you to name and add your own custom label!
"""
)
# Three views of one analysis run; all are filled by
# analyze_text_and_find_entities.
with gr.Tabs():
with gr.TabItem("Highlighted Text"):
highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
with gr.TabItem("Detailed Results"):
detailed_results_output = gr.Markdown(label="List of Found Entities by Label")
with gr.TabItem("Debug Info"):
debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)
# --- Backend Functions ---
def _parse_framework(raw_framework):
    """Parse the model's Markdown reply into a {category: [examples]} mapping.

    A line starting with "###" opens a category; each following line that
    starts with "-" contributes its comma-separated items to that category.
    Bullet lines seen before any category header are ignored.
    """
    framework = defaultdict(list)
    current_category = None
    for line in raw_framework.splitlines():
        line = line.strip()
        if line.startswith("###"):
            # Remove only the leading header marker (of any depth).
            current_category = line.lstrip("#").strip()
        elif line.startswith("-") and current_category:
            # Remove only the leading bullet. Replacing every "-" would also
            # eat hyphens inside names ("Anti-Corn Law League").
            items = line.lstrip("-")
            framework[current_category].extend(
                item.strip() for item in items.split(",") if item.strip()
            )
    return framework


def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
    """Ask the selected generative model for label ideas and show them.

    Generator callback for the "Generate Label Suggestions" button. It first
    yields an update that disables the button, then yields one update dict
    that fills/reveals one suggestion accordion per returned category (up to
    the number of pre-built accordions), hides the rest and re-enables the
    button.

    Args:
        topic: Topic text entered by the user.
        provider: Selected display name; a key of MODEL_OPTIONS.
        openai_k, anthropic_k, google_k: API keys typed into the UI. A
            non-empty OPENAI_API_KEY / ANTHROPIC_API_KEY / GOOGLE_API_KEY
            environment variable takes precedence over the typed value.

    Raises:
        gr.Error: On missing input, an unparseable model reply, or any
            failure while calling the model. The button is re-enabled first.
    """
    yield {
        generate_btn: gr.update(value="π§ Generating suggestions...", interactive=False)
    }
    restore_button = gr.update(value="Generate Label Suggestions", interactive=True)
    try:
        # `or` instead of a .get() default: an environment variable that is
        # set but empty must not mask the key typed into the UI.
        key_dict = {
            "openai_key": os.environ.get("OPENAI_API_KEY") or openai_k,
            "anthropic_key": os.environ.get("ANTHROPIC_API_KEY") or anthropic_k,
            "google_key": os.environ.get("GOOGLE_API_KEY") or google_k,
        }
        provider_id = MODEL_OPTIONS.get(provider)
        has_topic = bool(topic and topic.strip())
        if not has_topic or not provider or not key_dict.get(f"{provider_id}_key"):
            raise gr.Error("Topic, Provider, and the correct API Key are required.")

        prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
        raw_framework = generate_from_prompt(prompt, provider, key_dict)
        framework = _parse_framework(raw_framework)
        if not framework:
            raise gr.Error("AI failed to generate categories. Please try again or rephrase your topic.")

        updates = {}
        categories = list(framework.items())
        for i, (accordion_comp, checkbox_comp, button_comp) in enumerate(dynamic_components):
            if i < len(categories):
                category_name, entities = categories[i]
                # The labels are the entities themselves, grouped by the category name
                sorted_entities = sorted(set(entities))
                updates[accordion_comp] = gr.update(label=f"Category: {category_name}", visible=True)
                updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, label="Suggested Labels", visible=True)
                updates[button_comp] = gr.update(visible=True)
            else:
                # Unused slots stay (or become) hidden.
                updates[accordion_comp] = gr.update(visible=False)
                updates[checkbox_comp] = gr.update(visible=False)
                updates[button_comp] = gr.update(visible=False)
        updates[generate_btn] = restore_button
        yield updates
    except gr.Error:
        # Already a user-facing error: re-enable the button and pass it on
        # unchanged rather than re-wrapping its message.
        yield {generate_btn: restore_button}
        raise
    except Exception as e:
        yield {generate_btn: restore_button}
        raise gr.Error(str(e)) from e
def _collect_labels(standard_labels, custom_label_text, suggested_groups):
    """Merge the labels from all three UI sources into one sorted, unique list."""
    labels = set()
    for group in suggested_groups:
        if group:
            labels.update(group)
    if standard_labels:
        labels.update(standard_labels)
    # The textbox value may be empty or None; both mean "no custom labels".
    labels.update(
        part.strip() for part in (custom_label_text or "").split(",") if part.strip()
    )
    return sorted(labels)


def _extract_entities(model, text, labels, threshold, chunk_size=1024, overlap=100):
    """Run the model over overlapping character windows of *text*.

    Returns a list of entity dicts (copies of what the model returned) whose
    "start"/"end" are offsets into the full text rather than into the window.
    """
    found = []
    for offset in range(0, len(text), chunk_size - overlap):
        chunk = text[offset:offset + chunk_size]
        for ent in model.predict_entities(chunk, labels, threshold=threshold):
            found.append({**ent, "start": ent["start"] + offset, "end": ent["end"] + offset})
        if offset + chunk_size >= len(text):
            # This window already reached the end of the text; a further
            # window would only re-scan part of it.
            break
    return found


def _resolve_overlaps(entities):
    """Return non-overlapping entities sorted by position.

    Text inside the window overlap is scanned twice, so the same span can be
    reported twice (with different scores) or with different boundaries.
    Whenever two spans overlap, the one with the higher score is kept.
    """
    kept = []
    for ent in sorted(entities, key=lambda e: (e["start"], e["end"])):
        if kept and ent["start"] < kept[-1]["end"]:
            # Replacing the last kept span is safe: ent starts at or after
            # it, hence after every span kept before it.
            if ent["score"] > kept[-1]["score"]:
                kept[-1] = ent
            continue
        kept.append(ent)
    return kept


def _build_results_markdown(text, entities):
    """Render one Markdown table per label: text, instance count, average score."""
    aggregated = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
    for ent in entities:
        match_text = text[ent['start']:ent['end']]
        # Group case-insensitively within a label, remembering the first
        # casing seen for display.
        bucket = aggregated[(ent['label'], match_text.lower())]
        bucket['count'] += 1
        bucket['scores'].append(ent['score'])
        if not bucket['original_casing']:
            bucket['original_casing'] = match_text

    results_by_label = defaultdict(list)
    for (label, _), data in aggregated.items():
        results_by_label[label].append({
            'text': data['original_casing'],
            'count': data['count'],
            'avg_score': np.mean(data['scores']),
        })

    if not results_by_label:
        return "No entities found. Try lowering the confidence threshold or changing your labels."

    parts = []
    for label, items in sorted(results_by_label.items()):
        parts.append(f"### {label}\n")
        parts.append("| Text Found | Instances Found | Avg. Confidence Score* |\n")
        parts.append("|------------|-----------------|--------------------------|\n")
        # Most frequent first.
        for item in sorted(items, key=lambda x: x['count'], reverse=True):
            # A pipe or line break inside the matched text would break the table row.
            cell = item['text'].replace("|", "\\|").replace("\n", " ")
            parts.append(f"| {cell} | {item['count']} | {item['avg_score']:.2f} |\n")
        parts.append("\n")
    parts.append("\n---\n<small><i>*<b>Confidence Score:</b> How sure the AI Detective (GLiNER) is that it found the correct label (1.00 = 100% certain). The score shown is the average across all instances of that text.</i></small>")
    return "".join(parts)


def analyze_text_and_find_entities(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
    """Find entities in *text* with GLiNER and yield UI updates.

    Generator callback for the "Analyze" button. The first yield switches the
    UI into its busy state; the last yield delivers the highlighted text, the
    per-label result tables and the debug log, and re-enables the button.

    Args:
        text: Document to analyze.
        standard_labels: Selected entries of the standard label checkbox group.
        custom_label_text: Comma-separated custom labels (may be empty).
        threshold: Confidence threshold passed to the model.
        *suggested_labels_from_groups: One list of selected labels per
            AI-suggestion checkbox group.

    Raises:
        gr.Error: If the model was not loaded at startup or prediction
            fails. The button is re-enabled before the error is raised, so
            the UI is never left in the busy state.
    """
    # --- 1. Show Progress to User ---
    yield {
        analyze_btn: gr.update(value="π΅οΈ Analyzing...", interactive=False),
        analysis_status: gr.update(value="Our AI Detective is scanning your text. This may take a moment...", visible=True),
        highlighted_text_output: None,
        detailed_results_output: None,
        debug_output: "Starting analysis..."
    }
    idle_ui = {
        analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
        analysis_status: gr.update(visible=False),
    }
    debug_info = []
    if gliner_model is None:
        yield dict(idle_ui)
        raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check logs.")

    # --- 2. Collect All Labels from UI ---
    final_labels = _collect_labels(standard_labels, custom_label_text, suggested_labels_from_groups)
    debug_info.append(f"π§ Searching for {len(final_labels)} unique labels.")
    debug_info.append(f"βοΈ Confidence Threshold: {threshold}")
    if not text or not final_labels:
        yield {
            **idle_ui,
            highlighted_text_output: {"text": text or "", "entities": []},
            detailed_results_output: "Please provide text and select at least one label to search for.",
            debug_output: "Analysis stopped: No text or no labels provided."
        }
        return

    # --- 3. Run the GLiNER Model (The "Detective") ---
    try:
        raw_entities = _extract_entities(gliner_model, text, final_labels, threshold)
    except Exception as e:
        yield {**idle_ui, debug_output: "\n".join(debug_info + [f"Analysis failed: {e}"])}
        raise gr.Error(f"Analysis failed: {e}") from e
    # Merge duplicates from the window overlap and order by position. Doing
    # this by span (not by comparing whole dicts, score included) is what
    # actually removes the repeated detections and keeps the output stable.
    unique_entities = _resolve_overlaps(raw_entities)
    debug_info.append(f"π Found {len(unique_entities)} raw entity mentions.")

    # --- 4. Prepare Highlighted Text Output ---
    # HighlightedText reads the category from the "entity" key of each dict.
    highlighted_output_data = {
        "text": text,
        "entities": [
            {"entity": ent["label"], "start": ent["start"], "end": ent["end"]}
            for ent in unique_entities
        ]
    }

    # --- 5./6. Prepare the Detailed Table-Based Results ---
    markdown_string = _build_results_markdown(text, unique_entities)
    debug_info.append("✅ Analysis complete.")

    # --- 7. Yield Final Results to UI ---
    yield {
        **idle_ui,
        highlighted_text_output: highlighted_output_data,
        detailed_results_output: markdown_string,
        debug_output: "\n".join(debug_info)
    }
# --- Wire up UI events ---
# NOTE(review): this copy of the file has lost its indentation; these .click()
# registrations presumably sit inside the `with gr.Blocks(...)` block - confirm.
# handle_generate updates the button itself plus every (accordion, checkbox
# group, button) triple in dynamic_components, so the triples are flattened
# into one output list here.
generate_btn.click(
fn=handle_generate,
inputs=[topic, provider, openai_key, anthropic_key, google_key],
outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
)
# Functions for Select/Deselect All buttons
# deselect_all clears the target checkbox group's selection.
def deselect_all():
return gr.update(value=[])
# select_all sets the target checkbox group's selection to `choices`.
def select_all(choices):
return gr.update(value=choices)
deselect_all_std_btn.click(fn=deselect_all, inputs=None, outputs=[standard_labels_checkbox])
select_all_std_btn.click(lambda: select_all(STANDARD_LABELS), inputs=None, outputs=[standard_labels_checkbox])
# Each suggestion category gets its own "Deselect All" button that clears
# only that category's checkbox group.
for _, cg, btn in dynamic_components:
btn.click(fn=deselect_all, inputs=None, outputs=[cg])
# The suggestion checkbox groups are appended after the four fixed inputs;
# analyze_text_and_find_entities receives them through its *args parameter.
analyze_btn.click(
fn=analyze_text_and_find_entities,
inputs=[text_input, standard_labels_checkbox, custom_labels_textbox, threshold_slider] + [cg for acc, cg, btn in dynamic_components],
outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output]
)
# NOTE(review): share=True and debug=True are hard-coded - confirm both are
# wanted in the environment this app is deployed to.
demo.launch(share=True, debug=True) |