File size: 9,568 Bytes
43ea66b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# Standard library
import functools
import json
import os
import tempfile
from collections import defaultdict

# Third-party
import gradio as gr
import pandas as pd
import requests
from google.cloud import language_v1
from google.oauth2 import service_account

# --- Configuration & Authentication ---
# Service-account credentials arrive as a JSON string in the
# GOOGLE_API_KEY_JSON environment variable (e.g. a hosting-platform secret).
# Failures are recorded rather than raised so the UI can still load and
# surface the error to the user on their first analysis attempt.
api_key_json_string = os.environ.get("GOOGLE_API_KEY_JSON")
language_client = None   # set only on successful authentication
auth_success = False     # checked by analyze_text before any API call
auth_error_message = ""  # shown to the user via gr.Error when auth failed
if not api_key_json_string:
    auth_error_message = "ERROR: The `GOOGLE_API_KEY_JSON` secret is not set..."
else:
    try:
        credentials_info = json.loads(api_key_json_string)
        credentials = service_account.Credentials.from_service_account_info(credentials_info)
        language_client = language_v1.LanguageServiceClient(credentials=credentials)
        auth_success = True
    except Exception as e:
        # Covers both malformed JSON and credentials rejected by Google Cloud.
        auth_error_message = f"ERROR: Failed to authenticate with Google Cloud: {e}"

# --- Color Mapping for Highlighting ---
# One fixed color per entity type so the HighlightedText legend and the
# highlighted spans stay visually consistent across analyses. Keys must match
# the names produced by language_v1.Entity.Type(...).name.
ENTITY_TYPE_COLORS = {
    "PERSON": "#ffc107", # Amber
    "LOCATION": "#f44336", # Red
    "ORGANIZATION": "#2196f3", # Blue
    "EVENT": "#4caf50", # Green
    "WORK_OF_ART": "#9c27b0", # Purple
    "CONSUMER_GOOD": "#ff9800", # Orange
    "OTHER": "#9e9e9e", # Grey
    "PHONE_NUMBER": "#795548", # Brown
    "ADDRESS": "#607d8b", # Blue Grey
    "DATE": "#00bcd4", # Cyan
    "NUMBER": "#cddc39", # Lime
    "PRICE": "#e91e63" # Pink
}


# --- Helper Functions ---

@functools.lru_cache(maxsize=256)
def find_wikidata_entity(entity_name: str):
    """Search the Wikidata API for *entity_name* and return its page URL.

    Results are memoized (lru_cache) because the same entity name often
    recurs across analyses and each lookup costs a network round trip.

    Args:
        entity_name: Free-text name to search for (language fixed to "en").

    Returns:
        The ``https://www.wikidata.org/wiki/<QID>`` URL of the top search
        hit, or None when nothing matches or the request fails/times out.
    """
    try:
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": "en",
            "search": entity_name,
        }
        response = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=3)
        response.raise_for_status()
        data = response.json()
        if data.get("search"):
            return f"https://www.wikidata.org/wiki/{data['search'][0]['id']}"
    # ValueError also catches json.JSONDecodeError from malformed bodies,
    # which older `requests` versions do not fold into RequestException.
    except (requests.RequestException, ValueError, KeyError, IndexError):
        return None
    return None

def create_entity_dataframe(entities):
    """Build a display-ready pandas DataFrame from processed entity dicts.

    Each row carries the entity name, its salience formatted to three
    decimal places, and markdown links for Google KG / Wikipedia / Wikidata
    (the literal string "N/A" when a link is absent). Returns an empty
    DataFrame for an empty input list.
    """
    if not entities:
        return pd.DataFrame()

    def _link(url, text):
        # Render a markdown link, or the N/A placeholder for missing URLs.
        return f"[{text}]({url})" if url else "N/A"

    rows = [
        {
            "Entity": item['name'],
            "Salience": f"{item['salience']:.3f}",
            "Google KG": _link(item['google_kg_url'], "Search"),
            "Wikipedia": _link(item['wikipedia_url'], "Link"),
            "Wikidata": _link(item['wikidata_url'], "Link"),
        }
        for item in entities
    ]
    return pd.DataFrame(rows)

def format_text_for_highlighting(text, entities):
    """
    Convert the source text plus API entities into the list of
    (segment, label) pairs expected by Gradio's HighlightedText component.

    Gaps between mentions are emitted with a None label. Mentions that
    overlap an already-emitted span are skipped (earliest start wins), so
    the concatenated segments always reproduce the source text in order —
    the previous behavior duplicated overlapping text.

    NOTE(review): with EncodingType.UTF8 the API's begin_offset is a byte
    offset, while the slicing here is by character. Correct for ASCII-only
    input; multi-byte text may misalign — TODO confirm/convert offsets.
    """
    mentions = []
    for entity in entities:
        entity_type_name = language_v1.Entity.Type(entity.type_).name
        for mention in entity.mentions:
            mentions.append({
                "start": mention.text.begin_offset,
                "end": mention.text.begin_offset + len(mention.text.content),
                "text": mention.text.content,
                "label": entity_type_name
            })

    mentions.sort(key=lambda x: x['start'])

    highlighted_data = []
    last_index = 0
    for mention in mentions:
        # Skip mentions starting inside a span we already emitted; emitting
        # them would duplicate that stretch of text in the output.
        if mention['start'] < last_index:
            continue
        if mention['start'] > last_index:
            highlighted_data.append((text[last_index:mention['start']], None))

        highlighted_data.append((mention['text'], mention['label']))
        last_index = mention['end']

    # Trailing unlabelled text after the final mention.
    if last_index < len(text):
        highlighted_data.append((text[last_index:], None))

    return highlighted_data

# --- Core Logic Functions ---

def analyze_text(text_input: str):
    """
    Run entity analysis on *text_input* via the Google Natural Language API.

    Returns the 4-tuple consumed by the click handler: a summary string,
    the entities grouped by type name (stored into gr.State), the highlight
    data for HighlightedText, and a visibility update that reveals the
    results section. Raises gr.Error on auth failure or API errors.
    """
    if not auth_success:
        raise gr.Error(auth_error_message)
    if not text_input or not text_input.strip():
        return "Please enter text to analyze.", {}, None, gr.update(visible=False)

    try:
        doc = language_v1.Document(content=text_input, type_=language_v1.Document.Type.PLAIN_TEXT)
        response = language_client.analyze_entities(
            request={"document": doc, "encoding_type": language_v1.EncodingType.UTF8}
        )

        highlight_data = format_text_for_highlighting(text_input, response.entities)

        grouped = defaultdict(list)
        for entity in response.entities:
            kg_url = None
            if 'mid' in entity.metadata:
                kg_url = f"https://www.google.com/search?kgmid={entity.metadata['mid']}"
            wiki_url = entity.metadata.get("wikipedia_url", None)
            wikidata_url = find_wikidata_entity(entity.name)
            # How many of the three knowledge-base links resolved — used by
            # the "prioritize identified" sort in sort_and_update_ui.
            links_found = len([u for u in (kg_url, wiki_url, wikidata_url) if u])
            type_name = language_v1.Entity.Type(entity.type_).name

            grouped[type_name].append({
                'name': entity.name, 'type': type_name, 'salience': entity.salience,
                'google_kg_url': kg_url, 'wikipedia_url': wiki_url,
                'wikidata_url': wikidata_url, 'link_count': links_found
            })

        summary = f"Analysis complete. Found {len(response.entities)} total entities across {len(grouped)} types."
        # Make the (initially hidden) results column visible.
        return summary, grouped, highlight_data, gr.update(visible=True)

    except Exception as e:
        raise gr.Error(f"An error occurred during API call: {e}")

def sort_and_update_ui(all_entities, prioritize_identified):
    """
    Build the interleaved [accordion, dataframe, accordion, dataframe, ...]
    update list for the Detailed Breakdown tab.

    Args:
        all_entities: dict mapping entity-type name -> list of entity dicts
            (produced by analyze_text and held in gr.State).
        prioritize_identified: when True, sort each type's entities by
            (link_count, salience) descending; otherwise by salience alone.

    Returns:
        A flat list of 2 * len(display_order) updates, in the fixed display
        order, matching the component order created in the UI definition.
    """
    display_order = ["PERSON", "ORGANIZATION", "LOCATION", "EVENT", "WORK_OF_ART", "CONSUMER_GOOD", "OTHER", "PHONE_NUMBER", "ADDRESS", "DATE", "NUMBER", "PRICE"]
    updates = []

    for entity_type in display_order:
        # sorted() returns a copy — the previous in-place .sort() silently
        # mutated the lists held inside the shared gr.State session dict.
        if prioritize_identified:
            entities_of_type = sorted(
                all_entities.get(entity_type, []),
                key=lambda x: (x['link_count'], x['salience']),
                reverse=True,
            )
        else:
            entities_of_type = sorted(
                all_entities.get(entity_type, []),
                key=lambda x: x['salience'],
                reverse=True,
            )

        df = create_entity_dataframe(entities_of_type)
        is_visible = len(entities_of_type) > 0
        accordion_label = f"{entity_type.replace('_', ' ')} ({len(entities_of_type)} entities)"
        updates.append(gr.Accordion(label=accordion_label, visible=is_visible))
        updates.append(df)

    return updates

# --- Gradio Interface Definition ---
# NOTE: component creation order below is load-bearing — all_detailed_outputs
# interleaves accordions and dataframes in exactly the order that
# sort_and_update_ui emits its updates.

with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown("# Advanced Entity Analyzer")
    
    # Session-scoped store for analyze_text's grouped results; re-read by
    # sort_and_update_ui whenever the prioritize checkbox toggles.
    analysis_results_state = gr.State({})

    with gr.Row():
        text_input = gr.Textbox(lines=20, label="Enter Text for Analysis", placeholder="Paste your article content here...")
    
    analyze_button = gr.Button("Analyze Text", variant="primary")
    
    # Use gr.Column to group results, and make it initially invisible;
    # analyze_text returns gr.update(visible=True) to reveal it.
    with gr.Column(visible=False) as results_section:
        gr.Markdown("### Analysis Results")
        summary_output = gr.Textbox(label="Summary", interactive=False)
        
        with gr.Tabs():
            with gr.TabItem("Visual Analysis"):
                 highlighted_text_output = gr.HighlightedText(
                    label="Highlighted Entities",
                    color_map=ENTITY_TYPE_COLORS,
                    show_legend=True
                )
            with gr.TabItem("Detailed Breakdown"):
                prioritize_checkbox = gr.Checkbox(label="Prioritize identified entities (with links)", value=False)
                # One hidden accordion + dataframe per entity type; the
                # order here must match display_order in sort_and_update_ui.
                accordions = {}
                dataframes = {}
                all_types = ["PERSON", "ORGANIZATION", "LOCATION", "EVENT", "WORK_OF_ART", "CONSUMER_GOOD", "OTHER", "PHONE_NUMBER", "ADDRESS", "DATE", "NUMBER", "PRICE"]
                for entity_type in all_types:
                    with gr.Accordion(f"{entity_type.replace('_', ' ')}", visible=False) as acc:
                        accordions[entity_type] = acc
                        df = gr.Dataframe(
                            headers=["Entity", "Salience", "Google KG", "Wikipedia", "Wikidata"],
                            datatype=["str", "str", "markdown", "markdown", "markdown"],
                            wrap=True
                        )
                        dataframes[entity_type] = df
    
    # Flatten to [acc, df, acc, df, ...] — the exact shape of the update
    # list returned by sort_and_update_ui.
    all_detailed_outputs = [item for pair in zip(accordions.values(), dataframes.values()) for item in pair]

    # Two-step chain: run the analysis first, then render the per-type
    # breakdown from the freshly written state.
    analyze_button.click(
        fn=analyze_text,
        inputs=[text_input],
        outputs=[summary_output, analysis_results_state, highlighted_text_output, results_section]
    ).then(
        fn=sort_and_update_ui,
        inputs=[analysis_results_state, prioritize_checkbox],
        outputs=all_detailed_outputs
    )

    # Re-sort the existing results (no new API call) on checkbox toggle.
    prioritize_checkbox.change(
        fn=sort_and_update_ui,
        inputs=[analysis_results_state, prioritize_checkbox],
        outputs=all_detailed_outputs
    )

if __name__ == "__main__":
    iface.launch()