Spaces:

mk1985
/

Historical-Text-Analyser

Sleeping

App Files Files Community

Historical-Text-Analyser / app.py

mk1985

Update app.py

e9738aa verified 5 months ago

raw

history blame

18.1 kB

	# 📚 Install dependencies
	# Make sure to run this in your environment if you haven't already
	# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet

	# ⚙️ Imports
	import openai
	import anthropic
	import google.generativeai as genai
	import gradio as gr
	from gliner import GLiNER
	import traceback
	from collections import defaultdict, Counter
	import numpy as np # For calculating average score
	import os

	# 🧠 Supported models and their providers
	MODEL_OPTIONS = {
	    # Dropdown label shown to the user -> internal provider id.
	    # The id selects both the "<id>_key" API-key entry and the client used.
	    "OpenAI (GPT-4o)": "openai",
	    "Anthropic (Claude 3 Opus)": "anthropic",
	    "Google (Gemini 1.5 Pro)": "google",
	}

	# 🔧 GLiNER Model Configuration
	GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"

	# --- Load the model only once at startup ---
	# Loading is best-effort: on any failure the app still starts, `gliner_model`
	# is bound to None, and the analysis handler reports the problem to the user.
	try:
	    print("Loading AI Detective (GLiNER model)... This may take a moment.")
	    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
	    print("AI Detective loaded successfully.")
	except Exception as e:
	    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {e}")
	    # The one-line message above often hides the real cause; put the full
	    # stack trace in the logs that the UI error message points users to.
	    traceback.print_exc()
	    gliner_model = None

	# 🧠 Prompt for the Creative AI to generate label ideas.
	# The only placeholder is {topic}; handle_generate fills it with str.format().
	# The output shape requested here ("###" category headers followed by
	# "- a, b, c" lines) is exactly what the parser in handle_generate looks for,
	# so the prompt and the parser must be changed together.
	HIERARCHICAL_PROMPT_TEMPLATE = """
	You are a helpful research assistant. For the historical topic: "{topic}", your job is to suggest a research framework.

	Instructions:
	1. First, think of 4-6 Conceptual Categories that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands'). These will become the labels.
	2. For each category, list specific Examples someone could search for in a text.
	3. Crucial Rule for Labels: Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).

	Output Format:
	Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its examples.

	### Example Category 1
	- Example A, Example B, Example C
	### Example Category 2
	- Example D, Example E
	"""

	# 🧠 Generator Function (The "Creative Brain")
	def generate_from_prompt(prompt, provider, key_dict):
	    """Send ``prompt`` to the selected generative model and return its reply.

	    Args:
	        prompt: Full prompt text; sent as a single user message.
	        provider: Display name of the model, i.e. a key of ``MODEL_OPTIONS``.
	        key_dict: Mapping of ``"<provider_id>_key"`` to the API key string.

	    Returns:
	        The reply text with surrounding whitespace stripped.

	    Raises:
	        ValueError: If ``provider`` is unknown, has no API key in
	            ``key_dict``, or maps to a provider id with no client branch.
	    """
	    provider_id = MODEL_OPTIONS.get(provider)
	    if provider_id is None:
	        # Without this check an unknown provider would fall through to the
	        # key lookup and be reported as a missing API key.
	        raise ValueError(f"Unsupported provider: {provider!r}.")

	    api_key = key_dict.get(f"{provider_id}_key")
	    if not api_key:
	        raise ValueError(f"API key for {provider} not found.")

	    if provider_id == "openai":
	        client = openai.OpenAI(api_key=api_key)
	        response = client.chat.completions.create(
	            model="gpt-4o",
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0.2,
	        )
	        # `content` may be None (e.g. a refusal), so guard before stripping.
	        return (response.choices[0].message.content or "").strip()

	    if provider_id == "anthropic":
	        client = anthropic.Anthropic(api_key=api_key)
	        response = client.messages.create(
	            model="claude-3-opus-20240229",
	            max_tokens=1024,
	            messages=[{"role": "user", "content": prompt}],
	        )
	        return response.content[0].text.strip()

	    if provider_id == "google":
	        genai.configure(api_key=api_key)
	        model = genai.GenerativeModel('gemini-1.5-pro-latest')
	        response = model.generate_content(prompt)
	        return response.text.strip()

	    # A provider id was added to MODEL_OPTIONS without a matching branch
	    # here; fail loudly instead of returning an empty reply.
	    raise ValueError(f"No client implemented for provider id {provider_id!r}.")

	# --- UI Definitions ---

	# Built-in, general-purpose labels the user can always pick from.
	# Order matters: it is the order in which the checkboxes are listed.
	STANDARD_LABELS = [
	    "PERSON", "ORGANIZATION",
	    "LOCATION", "COUNTRY", "CITY", "STATE",
	    "NATIONALITY", "GROUP",
	    "DATE", "EVENT",
	    "LAW", "LEGAL_DOCUMENT",
	    "PRODUCT", "FACILITY", "WORK_OF_ART", "LANGUAGE",
	    "TIME", "PERCENTAGE", "MONEY", "CURRENCY", "QUANTITY",
	    "ORDINAL_NUMBER", "CARDINAL_NUMBER",
	]

	# Upper bound on how many AI-suggested categories the UI can display.
	MAX_CATEGORIES = 8

	# Top-level Blocks container for the whole app; `demo` is launched at the
	# bottom of the file. The css rule lets long words wrap inside .prose elements.
	# NOTE(review): the original nesting of the `with` blocks below is not visible
	# in this copy of the file — confirm indentation against the real app.py.
	with gr.Blocks(title="Smart Text Analyzer", css=".prose { word-break: break-word; }") as demo:
	gr.Markdown("# Smart Text Analyzer")
	gr.Markdown(
	"""
	Welcome! Paste your text below to automatically find and highlight key information. It's like having two smart assistants read your document for you.

	### How It Works: Two Brains are Better Than One!
	We use two different types of AI to give you the best results.

	🧠 1. The Creative Brain (Generative AI - like GPT)
	This AI is a brainstormer. It reads your topic to understand the context, then imagines and suggests useful labels that fit your document. It helps you discover what to look for!

	🕵️ 2. The Detective (Extractive AI - GLiNER)
	This AI is a precise detective. Once you give it a list of labels, it meticulously scans the text and pulls out (extracts) the exact words that match. It's fantastic at finding specific information with high accuracy.
	"""
	)

	# Step 1 inputs: the topic, the generative model to use, and one
	# password-masked API-key box per provider.
	gr.Markdown("--- \n## Step 1: Get Label Ideas from the Creative AI")
	with gr.Row():
	topic = gr.Textbox(label="Enter a Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
	provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Creative AI Model")
	with gr.Row():
	openai_key = gr.Textbox(label="OpenAI API Key", type="password")
	anthropic_key = gr.Textbox(label="Anthropic API Key", type="password")
	google_key = gr.Textbox(label="Google API Key", type="password")

	generate_btn = gr.Button("Generate Label Suggestions", variant="primary")

	gr.Markdown("--- \n## Step 2: Build Your Search & Analyze Text")
	gr.Markdown(
	"""
	### What are Entities or Labels?
	Think of them as special highlighters! They find and color-code specific types of information in your text, like `PERSON`, `DATE`, `LOCATION`, or custom things you define.
	"""
	)

	gr.Markdown("#### 1. Review AI-Suggested Labels")
	gr.Markdown("The AI's suggestions appear below. Uncheck any you don't want.")

	# Pre-build MAX_CATEGORIES hidden accordions. Each (accordion, checkbox group,
	# deselect button) triple is stored in dynamic_components so that
	# handle_generate can later fill the checkbox choices and reveal the accordion.
	dynamic_components = []
	with gr.Column():
	for i in range(MAX_CATEGORIES):
	with gr.Accordion(f"Suggested Label Category {i+1}", visible=False) as acc:
	with gr.Row():
	# The CheckboxGroup holds the actual labels (e.g., "Protest", "Petition")
	cg = gr.CheckboxGroup(label="Labels in this category", interactive=True, container=False, scale=4)
	deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
	dynamic_components.append((acc, cg, deselect_btn))

	# Standard labels start with every entry selected (value=STANDARD_LABELS).
	gr.Markdown("#### 2. Include Standard Labels (Optional)")
	with gr.Group():
	standard_labels_checkbox = gr.CheckboxGroup(choices=STANDARD_LABELS, value=STANDARD_LABELS, label="Standard Entity Labels", info="Common categories like people, places, and dates.")
	with gr.Row():
	select_all_std_btn = gr.Button("Select All", size="sm")
	deselect_all_std_btn = gr.Button("Deselect All", size="sm")


	# Free-text labels; the analysis handler splits this value on commas.
	gr.Markdown("#### 3. Add Your Own Custom Labels (Optional)")
	with gr.Group():
	custom_labels_textbox = gr.Textbox(label="Enter Custom Labels (comma-separated)", placeholder="e.g., Technology, Weapon, Secret Society...")

	gr.Markdown("--- \n## Step 3: Analyze Your Document")
	# The slider value is passed to GLiNER as the `threshold` argument.
	threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="Controls how strict the AI Detective is. Lower to find more matches. Higher for fewer, more precise matches.")
	text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
	analyze_btn = gr.Button("Analyze Text & Find Entities", variant="primary")

	# Hidden by default; the analysis handler makes it visible while it runs.
	analysis_status = gr.Markdown(visible=False) # For the "Analyzing..." message

	gr.Markdown("--- \n## Step 4: Review Your Results")
	gr.Markdown(
	"""
	✨ Pro Tip: Create Your Own Labels!
	Did our AI miss something? In the "Highlighted Text" view below, simply click and drag to highlight any piece of text. A small box will appear, allowing you to name and add your own custom label!
	"""
	)

	# Three result views, all filled by analyze_text_and_find_entities:
	# highlighted spans, a per-label Markdown table, and a plain-text log.
	with gr.Tabs():
	with gr.TabItem("Highlighted Text"):
	highlighted_text_output = gr.HighlightedText(label="Found Entities", interactive=True)
	with gr.TabItem("Detailed Results"):
	detailed_results_output = gr.Markdown(label="List of Found Entities by Label")
	with gr.TabItem("Debug Info"):
	debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)

	# --- Backend Functions ---

	def _parse_label_framework(raw_framework):
	    """Parse the model's Markdown reply into ``{category: [label, ...]}``.

	    A line starting with ``###`` opens a category; each following line that
	    starts with ``-`` contributes a comma-separated list of labels to it.
	    All other lines, and bullet lines seen before any header, are ignored.
	    """
	    framework = defaultdict(list)
	    current_category = None
	    for line in raw_framework.splitlines():
	        line = line.strip()
	        if line.startswith("###"):
	            current_category = line.lstrip("#").strip()
	        elif line.startswith("-") and current_category:
	            # Strip only the leading bullet marker. Removing every "-" would
	            # corrupt hyphenated labels such as "Anti-Corn Law League".
	            labels = [part.strip() for part in line.lstrip("-").split(",") if part.strip()]
	            if labels:  # do not create a category that has nothing to tick
	                framework[current_category].extend(labels)
	    return framework

	def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
	    """Ask the chosen generative model for label ideas and fill the suggestion UI.

	    This is a generator event handler. It first yields an update that
	    disables the generate button, then yields one dict that updates every
	    suggestion accordion / checkbox group / button and re-enables the button.

	    Args:
	        topic: Historical topic entered by the user.
	        provider: Selected model display name (a key of ``MODEL_OPTIONS``).
	        openai_k: OpenAI API key typed into the UI.
	        anthropic_k: Anthropic API key typed into the UI.
	        google_k: Google API key typed into the UI.

	    Raises:
	        gr.Error: If a required input is missing, the model call fails, or
	            the reply contains no parseable categories. The button is
	            re-enabled before the error is raised.
	    """
	    yield {
	        generate_btn: gr.update(value="🧠 Generating suggestions...", interactive=False)
	    }

	    try:
	        # A non-empty environment variable wins over the textbox. `or` (not
	        # a .get() default) is used so that a variable that is set but empty
	        # does not mask the key the user typed.
	        key_dict = {
	            "openai_key": os.environ.get("OPENAI_API_KEY") or openai_k,
	            "anthropic_key": os.environ.get("ANTHROPIC_API_KEY") or anthropic_k,
	            "google_key": os.environ.get("GOOGLE_API_KEY") or google_k,
	        }

	        provider_id = MODEL_OPTIONS.get(provider)
	        if not topic or not provider or not key_dict.get(f"{provider_id}_key"):
	            raise gr.Error("Topic, Provider, and the correct API Key are required.")

	        prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
	        raw_framework = generate_from_prompt(prompt, provider, key_dict)

	        framework = _parse_label_framework(raw_framework)
	        if not framework:
	            raise gr.Error("AI failed to generate categories. Please try again or rephrase your topic.")

	        updates = {}
	        categories = list(framework.items())
	        # Only as many categories as there are pre-built accordions can be
	        # shown; any further categories in the reply are dropped.
	        for i, (accordion_comp, checkbox_comp, button_comp) in enumerate(dynamic_components):
	            if i < len(categories):
	                category_name, entities = categories[i]
	                # The labels are the entities themselves, grouped by the category name
	                sorted_entities = sorted(set(entities))
	                updates[accordion_comp] = gr.update(label=f"Category: {category_name}", visible=True)
	                updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, label="Suggested Labels", visible=True)
	                updates[button_comp] = gr.update(visible=True)
	            else:
	                updates[accordion_comp] = gr.update(visible=False)
	                updates[checkbox_comp] = gr.update(visible=False)
	                updates[button_comp] = gr.update(visible=False)

	        updates[generate_btn] = gr.update(value="Generate Label Suggestions", interactive=True)
	        yield updates
	    except Exception as e:
	        # Always hand the button back before surfacing the error.
	        yield {generate_btn: gr.update(value="Generate Label Suggestions", interactive=True)}
	        if isinstance(e, gr.Error):
	            # Already a user-facing error: re-raise it unchanged instead of
	            # re-wrapping its string form.
	            raise
	        raise gr.Error(str(e)) from e

	def _collect_labels(standard_labels, custom_label_text, suggested_groups):
	    """Merge labels from all three UI sources into one sorted, de-duplicated list."""
	    labels = set()
	    for group in suggested_groups:
	        if group:
	            labels.update(group)
	    if standard_labels:
	        labels.update(standard_labels)
	    # `or ""` tolerates a textbox value of None.
	    labels.update(part.strip() for part in (custom_label_text or "").split(',') if part.strip())
	    return sorted(labels)

	def _dedupe_entities(entities):
	    """Keep one entity per (start, end, label), preferring the highest score.

	    The same span found in two overlapping chunks usually gets two slightly
	    different scores, so comparing whole entity dicts would keep both copies
	    and inflate the instance counts. The result is ordered by position.
	    """
	    best = {}
	    for ent in entities:
	        key = (ent['start'], ent['end'], ent['label'])
	        if key not in best or ent['score'] > best[key]['score']:
	            best[key] = ent
	    return sorted(best.values(), key=lambda ent: (ent['start'], ent['end']))

	def _build_results_markdown(text, entities):
	    """Render one Markdown table per label summarising the matched texts."""
	    # Group mentions by (label, lower-cased text) so casing variants are
	    # counted together; the first casing seen is the one displayed.
	    aggregated_matches = defaultdict(lambda: {'count': 0, 'scores': [], 'original_casing': ''})
	    for ent in entities:
	        match_text = text[ent['start']:ent['end']]
	        bucket = aggregated_matches[(ent['label'], match_text.lower())]
	        bucket['count'] += 1
	        bucket['scores'].append(ent['score'])
	        if not bucket['original_casing']:
	            bucket['original_casing'] = match_text

	    results_by_label = defaultdict(list)
	    for (label, _), data in aggregated_matches.items():
	        results_by_label[label].append({
	            'text': data['original_casing'],
	            'count': data['count'],
	            'avg_score': np.mean(data['scores']),
	        })

	    if not results_by_label:
	        return "No entities found. Try lowering the confidence threshold or changing your labels."

	    parts = []
	    for label, items in sorted(results_by_label.items()):
	        parts.append(f"### {label}\n")
	        # Plain "|" separators: a "\|" in a normal string literal emits a
	        # literal backslash, which Markdown reads as an escaped pipe, so
	        # the table would not render.
	        parts.append("| Text Found | Instances Found | Avg. Confidence Score* |\n")
	        parts.append("|------------|-----------------|--------------------------|\n")
	        # Sort items by count (most frequent first)
	        for item in sorted(items, key=lambda x: x['count'], reverse=True):
	            # Pipes and newlines inside the matched text would break the row.
	            cell = item['text'].replace("|", "\\|").replace("\n", " ")
	            parts.append(f"| {cell} | {item['count']} | {item['avg_score']:.2f} |\n")
	        parts.append("\n")
	    parts.append("\n---\n<small><i>*<b>Confidence Score:</b> How sure the AI Detective (GLiNER) is that it found the correct label (1.00 = 100% certain). The score shown is the average across all instances of that text.</i></small>")
	    return "".join(parts)

	def analyze_text_and_find_entities(text, standard_labels, custom_label_text, threshold, *suggested_labels_from_groups):
	    """Run GLiNER over ``text`` and stream progress and results to the UI.

	    This is a generator event handler. It yields a progress update first,
	    then either a "nothing to do" update or the final results.

	    Args:
	        text: Document to analyse.
	        standard_labels: Labels ticked in the standard-label checkbox group.
	        custom_label_text: Comma-separated labels typed by the user.
	        threshold: Confidence threshold passed to ``predict_entities``.
	        *suggested_labels_from_groups: One list of ticked labels per
	            AI-suggested category checkbox group.

	    Raises:
	        gr.Error: If the GLiNER model is unavailable or the analysis fails.
	            The analyze button is re-enabled before the error is raised.
	    """
	    # --- 1. Show Progress to User ---
	    yield {
	        analyze_btn: gr.update(value="🕵️ Analyzing...", interactive=False),
	        analysis_status: gr.update(value="Our AI Detective is scanning your text. This may take a moment...", visible=True),
	        highlighted_text_output: None,
	        detailed_results_output: None,
	        debug_output: "Starting analysis..."
	    }

	    try:
	        if gliner_model is None:
	            raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check logs.")

	        # --- 2. Collect All Labels from UI ---
	        final_labels = _collect_labels(standard_labels, custom_label_text, suggested_labels_from_groups)
	        debug_info = [
	            f"🧠 Searching for {len(final_labels)} unique labels.",
	            f"⚙️ Confidence Threshold: {threshold}",
	        ]

	        if not text or not final_labels:
	            yield {
	                analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
	                analysis_status: gr.update(visible=False),
	                highlighted_text_output: {"text": text or "", "entities": []},
	                detailed_results_output: "Please provide text and select at least one label to search for.",
	                debug_output: "Analysis stopped: No text or no labels provided."
	            }
	            return

	        # --- 3. Run the GLiNER Model (The "Detective") ---
	        # Process text in overlapping character chunks to handle very long
	        # documents; offsets are shifted back into whole-document coordinates.
	        chunk_size, overlap = 1024, 100
	        all_entities = []
	        for offset in range(0, len(text), chunk_size - overlap):
	            chunk = text[offset : offset + chunk_size]
	            if not chunk.strip():
	                continue  # whitespace only: nothing to find
	            for ent in gliner_model.predict_entities(chunk, final_labels, threshold=threshold):
	                all_entities.append({**ent, 'start': ent['start'] + offset, 'end': ent['end'] + offset})

	        unique_entities = _dedupe_entities(all_entities)
	        debug_info.append(f"📊 Found {len(unique_entities)} raw entity mentions.")

	        # --- 4. Prepare Highlighted Text Output ---
	        highlighted_output_data = {
	            "text": text,
	            "entities": [{"start": ent["start"], "end": ent["end"], "label": ent["label"]} for ent in unique_entities]
	        }

	        # --- 5/6. Prepare the Detailed Table-Based Results ---
	        markdown_string = _build_results_markdown(text, unique_entities)

	        debug_info.append("✅ Analysis complete.")

	        # --- 7. Yield Final Results to UI ---
	        yield {
	            analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
	            analysis_status: gr.update(visible=False),
	            highlighted_text_output: highlighted_output_data,
	            detailed_results_output: markdown_string,
	            debug_output: "\n".join(debug_info)
	        }
	    except Exception as e:
	        # Without this reset the button would stay disabled on "Analyzing..."
	        # after any failure, leaving the UI stuck.
	        yield {
	            analyze_btn: gr.update(value="Analyze Text & Find Entities", interactive=True),
	            analysis_status: gr.update(visible=False),
	            debug_output: f"Analysis failed: {e}"
	        }
	        if isinstance(e, gr.Error):
	            raise
	        raise gr.Error(str(e)) from e

	# --- Wire up UI events ---
	# handle_generate updates the button itself plus every component of every
	# (accordion, checkbox group, button) triple, flattened in creation order.
	generate_btn.click(
	    fn=handle_generate,
	    inputs=[topic, provider, openai_key, anthropic_key, google_key],
	    outputs=[generate_btn, *(comp for triple in dynamic_components for comp in triple)],
	)

	# Functions for Select/Deselect All buttons
	def deselect_all():
	    """Return an update that clears a checkbox group's selection."""
	    return gr.update(value=[])

	def select_all(choices):
	    """Return an update that selects every entry of ``choices``."""
	    return gr.update(value=choices)

	deselect_all_std_btn.click(
	    fn=deselect_all,
	    inputs=None,
	    outputs=[standard_labels_checkbox],
	)
	select_all_std_btn.click(
	    lambda: select_all(STANDARD_LABELS),
	    inputs=None,
	    outputs=[standard_labels_checkbox],
	)

	# Each suggestion category has its own "Deselect All" button that clears
	# only the checkbox group of that category.
	for _accordion, checkbox_group, deselect_button in dynamic_components:
	    deselect_button.click(fn=deselect_all, inputs=None, outputs=[checkbox_group])

	# Inputs after the slider are the suggestion checkbox groups, received by the
	# handler as its variadic *suggested_labels_from_groups argument.
	analyze_btn.click(
	    fn=analyze_text_and_find_entities,
	    inputs=[
	        text_input,
	        standard_labels_checkbox,
	        custom_labels_textbox,
	        threshold_slider,
	        *(triple[1] for triple in dynamic_components),
	    ],
	    outputs=[analyze_btn, analysis_status, highlighted_text_output, detailed_results_output, debug_output],
	)

	# Start the web server for the app.
	demo.launch(share=True, debug=True)