Spaces:

AIEcosystem
/

relationship-map

Sleeping

App Files Files Community

AIEcosystem committed on Oct 8, 2025

Commit

beafc64

verified ·

1 Parent(s): 3d9a101

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +766 -58

src/streamlit_app.py CHANGED Viewed

@@ -1,66 +1,774 @@
 import streamlit as st
-import plotly.express as px
 import pandas as pd
 from io import BytesIO
-from pptx import Presentation
-from pptx.util import Inches
-# Sample data and Plotly graph
-df = pd.DataFrame({'Category': ['A', 'B', 'C'], 'Value': [10, 20, 30]})
-fig = px.bar(df, x='Category', y='Value', title='Sample Plotly Bar Chart')
-# Convert Plotly figure to image
-img_buffer = BytesIO()
-fig.write_image(img_buffer, format='png', width=800, height=400)
-img_buffer.seek(0)
-img_data = img_buffer.getvalue()
-# Function to create PPTX
-def create_presentation():
-    prs = Presentation()
-    # Title slide
-    slide = prs.slides.add_slide(prs.slide_layouts[0])
-    title = slide.shapes.title
-    title.text = "Streamlit Plotly Export"
-    # Slide with Plotly image and table
-    slide = prs.slides.add_slide(prs.slide_layouts[1])
-    title = slide.shapes.title
-    title.text = "Plotly Chart and Data"
-    # Add Plotly image
-    left = Inches(1)
-    top = Inches(1.5)
-    slide.shapes.add_picture(BytesIO(img_data), left, top, width=Inches(6))
-    # Add table
-    rows, cols = df.shape
-    left = Inches(1)
-    top = Inches(4)
-    width = Inches(6)
-    height = Inches(0.8)
-    table = slide.shapes.add_table(rows + 1, cols, left, top, width, height).table
-    table.cell(0, 0).text = 'Category'
-    table.cell(0, 1).text = 'Value'
-    for i in range(rows):
-        table.cell(i + 1, 0).text = df.iloc[i]['Category']
-        table.cell(i + 1, 1).text = str(df.iloc[i]['Value'])
-    # Save to bytes
-    bio = BytesIO()
-    prs.save(bio)
-    bio.seek(0)
-    return bio.getvalue()
-# Streamlit UI
-st.title("Export Plotly Graph to PPTX")
-st.plotly_chart(fig)  # Display the Plotly chart
-if st.button("Generate and Download Slides"):
-    pptx_data = create_presentation()
-    st.download_button(
-        label="Download PPTX",
-        data=pptx_data,
-        file_name="plotly_slides.pptx",
-        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation"
-    )

+import os
+os.environ['HF_HOME'] = '/tmp'
+import time
 import streamlit as st
+import streamlit.components.v1 as components
 import pandas as pd
+import io
+import plotly.express as px
+import plotly.graph_objects as go
+import numpy as np
+import re
+import string
+import json
+# --- Imports for file generation (no pptx) ---
 from io import BytesIO
+import plotly.io as pio
+# ---------------------------
+# --- Stable Scikit-learn LDA Imports ---
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+# ------------------------------
+from gliner import GLiNER
+from streamlit_extras.stylable_container import stylable_container
+# comet_ml is optional: when it cannot be imported, a do-nothing stand-in with
+# the same four methods the app calls (constructor, log_parameter, log_table,
+# end) is defined so the rest of the script can reference `Experiment` freely.
+try:
+    from comet_ml import Experiment
+except ImportError:
+    class Experiment:
+        # Accepts (and ignores) any keyword arguments such as api_key/workspace.
+        def __init__(self, **kwargs): pass
+        # No-op replacement for parameter logging; positional arguments only.
+        def log_parameter(self, *args): pass
+        # No-op replacement for table logging; positional arguments only.
+        def log_table(self, *args): pass
+        # No-op replacement for closing the experiment.
+        def end(self): pass
+# --- Model Home Directory (Fix for deployment environments) ---
+# Set HF_HOME environment variable to a writable path
+# NOTE(review): the same assignment already runs right after `import os` at the
+# top of the file, before the model libraries are imported; this second
+# assignment repeats it with the same value.
+os.environ['HF_HOME'] = '/tmp'
+# --- Color Map for Highlighting and Network Graph Nodes (NO PINK COLORS) ---
+# Maps each entity label to the hex colour used for text highlights and for
+# network-graph node markers.
+# NOTE(review): "#ec4899" (used for "time") is a pink hue and "#f43f5e"
+# ("money") is a rose/red hue, which appears to contradict the "NO PINK
+# COLORS" remark above - confirm the intended palette.
+entity_color_map = {
+    "person": "#10b981",
+    "country": "#3b82f6",
+    "city": "#4ade80",
+    "organization": "#f59e0b",
+    "date": "#8b5cf6",
+    "time": "#ec4899",
+    "cardinal": "#06b6d4",
+    "money": "#f43f5e",
+    "position": "#a855f7",
+}
+# --- Label Definitions and Category Mapping (Used by the App) ---
+# `labels` is the list of entity types passed to the NER model.
+# NOTE(review): it is assigned twice; the literal on the second line lists the
+# same nine labels, in the same order, as the keys of entity_color_map, so the
+# first assignment is immediately overwritten with an equal value.
+labels = list(entity_color_map.keys())
+labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
+# Groups the fine-grained labels into four display categories.
+category_mapping = {
+   "People": ["person", "organization", "position"],
+   "Locations": ["country", "city"],
+   "Time": ["date", "time"],
+   "Numbers": ["money", "cardinal"]
+}
+# Inverse lookup (label -> category), used to fill the 'category' column.
+reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
+# --- Utility Functions for Analysis and Plotly ---
+def extract_label(node_name: str) -> str:
+    """Return the label from a node string shaped like 'Text (Label)'.
+
+    Only a parenthesised group at the very end of the string is considered.
+    Returns the text between the parentheses, or "Unknown" when the string
+    does not end with a non-empty "(...)" group.
+    """
+    # `[^)]+` requires at least one character and forbids ')' inside the group;
+    # the `$` anchor ties the match to the end of the string.
+    match = re.search(r'\(([^)]+)\)$', node_name)
+    return match.group(1) if match else "Unknown"
+def remove_trailing_punctuation(text_string: str) -> str:
+    """Return `text_string` with trailing ASCII punctuation stripped.
+
+    Every trailing character that appears in `string.punctuation` is removed;
+    leading and interior punctuation is left untouched.
+    """
+    return text_string.rstrip(string.punctuation)
+def highlight_entities(text, df_entities):
+    """Generates HTML to display text with entities highlighted and colored."""
+    if df_entities.empty:
+        return text
+    # Sort entities by start index descending to insert highlights without affecting subsequent indices
+    entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
+    highlighted_text = text
+    for entity in entities:
+        start = entity['start']
+        end = entity['end']
+        label = entity['label']
+        entity_text = entity['text']
+        color = entity_color_map.get(label, '#000000')
+        # Create a span with background color and tooltip
+        highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
+        # Replace the original text segment with the highlighted HTML
+        highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
+    # Use a div to mimic the Streamlit input box style for the report
+    return f'<div style="border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
+def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
+    """
+    Fit an LDA topic model on the unique entity strings and return the top
+    words of every topic in long (one row per topic/word) form.
+
+    Args:
+        df_entities: DataFrame with a 'text' column of extracted entities.
+        num_topics: Number of LDA components to fit.
+        num_top_words: Upper bound on the words reported per topic; it is
+            capped at the number of unique entity strings.
+
+    Returns:
+        A DataFrame with columns 'Topic_ID' ('Topic #1', ...), 'Word' and
+        'Weight', or None when there are fewer than two unique entity strings
+        or when vectorising/fitting raises (the error is shown via st.error).
+    """
+    # Each unique entity string is treated as one "document".
+    documents = df_entities['text'].unique().tolist()
+    if len(documents) < 2:
+        return None
+    N = min(num_top_words, len(documents))
+    try:
+        tfidf_vectorizer = TfidfVectorizer(
+            max_df=0.95,
+            min_df=1,
+            stop_words='english'
+        )
+        tfidf = tfidf_vectorizer.fit_transform(documents)
+        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+        # random_state is fixed so repeated runs on the same input agree.
+        lda = LatentDirichletAllocation(
+            n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
+        )
+        lda.fit(tfidf)
+        topic_data_list = []
+        for topic_idx, topic in enumerate(lda.components_):
+            # Reverse slice of the ascending argsort: indices of the (up to) N
+            # largest weights, heaviest first.
+            top_words_indices = topic.argsort()[:-N - 1:-1]
+            top_words = [tfidf_feature_names[i] for i in top_words_indices]
+            word_weights = [topic[i] for i in top_words_indices]
+            for word, weight in zip(top_words, word_weights):
+                topic_data_list.append({
+                    'Topic_ID': f'Topic #{topic_idx + 1}',
+                    'Word': word,
+                    'Weight': weight,
+                })
+        return pd.DataFrame(topic_data_list)
+    except Exception as e:
+        # Best-effort: any failure is reported in the UI and turned into None.
+        st.error(f"Topic modeling failed: {e}")
+        return None
+def create_topic_word_bubbles(df_topic_data):
+    """Build a Plotly bubble chart of the top words across all topics.
+
+    Args:
+        df_topic_data: DataFrame with 'Topic_ID', 'Word' and 'Weight' columns
+            (the shape returned by perform_topic_modeling).
+
+    Returns:
+        A Plotly figure with one bubble per row (x = row index, y and bubble
+        size = weight, colour = topic), or None when the frame is empty.
+    """
+    # Renaming columns to match the output of perform_topic_modeling
+    # (rename returns a new frame, so the caller's DataFrame is not modified).
+    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
+    df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
+    if df_topic_data.empty:
+        return None
+    fig = px.scatter(
+        df_topic_data,
+        x='x_pos',
+        y='weight',
+        size='weight',
+        color='topic',
+        hover_name='word',
+        size_max=80,
+        title='Topic Word Weights (Bubble Chart)',
+        color_discrete_sequence=px.colors.qualitative.Bold,
+        labels={
+            'x_pos': 'Entity/Word Index',
+            'weight': 'Word Weight',
+            'topic': 'Topic ID'
+        },
+        custom_data=['word', 'weight', 'topic']
+    )
+    fig.update_layout(
+        xaxis_title="Entity/Word (Bubble size = Word Weight)",
+        yaxis_title="Word Weight",
+        xaxis={'tickangle': -45, 'showgrid': False},
+        yaxis={'showgrid': True},
+        showlegend=True,
+        plot_bgcolor='#FFFFFF', # Removed pink
+        paper_bgcolor='#FFFFFF', # Removed pink
+        height=600,
+        margin=dict(t=50, b=100, l=50, r=10),
+    )
+    # The hover box shows the word and its weight (customdata[0] / [1]);
+    # <extra></extra> suppresses Plotly's secondary trace-name box.
+    fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
+marker=dict(line=dict(width=1, color='DarkSlateGrey')))
+    return fig
+def generate_network_graph(df, raw_text):
+    """
+    Generates a network graph visualization (Node Plot) with edges
+    based on entity co-occurrence in sentences.
+    """
+    entity_counts = df['text'].value_counts().reset_index()
+    entity_counts.columns = ['text', 'frequency']
+    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
+    if unique_entities.shape[0] < 2:
+        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
+    num_nodes = len(unique_entities)
+    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
+    radius = 10
+    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
+    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
+    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
+    edges = set()
+    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
+    for sentence in sentences:
+        entities_in_sentence = []
+        for entity_text in unique_entities['text'].unique():
+            if entity_text.lower() in sentence.lower():
+                entities_in_sentence.append(entity_text)
+        unique_entities_in_sentence = list(set(entities_in_sentence))
+        for i in range(len(unique_entities_in_sentence)):
+            for j in range(i + 1, len(unique_entities_in_sentence)):
+                node1 = unique_entities_in_sentence[i]
+                node2 = unique_entities_in_sentence[j]
+                edge_tuple = tuple(sorted((node1, node2)))
+                edges.add(edge_tuple)
+    edge_x = []
+    edge_y = []
+    for edge in edges:
+        n1, n2 = edge
+        if n1 in pos_map and n2 in pos_map:
+            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
+            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
+    fig = go.Figure()
+    edge_trace = go.Scatter(
+        x=edge_x, y=edge_y,
+        line=dict(width=0.5, color='#888'),
+        hoverinfo='none',
+        mode='lines',
+        name='Co-occurrence Edges',
+        showlegend=False
+    )
+    fig.add_trace(edge_trace)
+    fig.add_trace(go.Scatter(
+        x=unique_entities['x'],
+        y=unique_entities['y'],
+        mode='markers+text',
+        name='Entities',
+        text=unique_entities['text'],
+        textposition="top center",
+        showlegend=False,
+        marker=dict(
+            size=unique_entities['frequency'] * 5 + 10,
+            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
+            line_width=1,
+            line_color='black',
+            opacity=0.9
+        ),
+        textfont=dict(size=10),
+        customdata=unique_entities[['label', 'score', 'frequency']],
+        hovertemplate=(
+            "<b>%{text}</b><br>" +
+            "Label: %{customdata[0]}<br>" +
+            "Score: %{customdata[1]:.2f}<br>" +
+            "Frequency: %{customdata[2]}<extra></extra>"
+        )
+    ))
+    legend_traces = []
+    seen_labels = set()
+    for index, row in unique_entities.iterrows():
+        label = row['label']
+        if label not in seen_labels:
+            seen_labels.add(label)
+            color = entity_color_map.get(label, '#cccccc')
+            legend_traces.append(go.Scatter(
+                x=[None], y=[None], mode='markers', marker=dict(size=10, color=color),
+name=f"{label.capitalize()}", showlegend=True
+            ))
+    for trace in legend_traces:
+        fig.add_trace(trace)
+    fig.update_layout(
+        title='Entity Co-occurrence Network (Edges = Same Sentence)',
+        showlegend=True,
+        hovermode='closest',
+        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
+        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
+        plot_bgcolor='#f9f9f9',
+        paper_bgcolor='#f9f9f9',
+        margin=dict(t=50, b=10, l=10, r=10),
+        height=600
+    )
+    return fig
+# --- NEW CSV GENERATION FUNCTION ---
+def generate_entity_csv(df):
+    """
+    Serialise the extracted entities to CSV in an in-memory buffer.
+
+    Args:
+        df: Entity DataFrame; it must contain the columns 'text', 'label',
+            'category', 'score', 'start' and 'end' (a missing column makes
+            the column selection raise KeyError).
+
+    Returns:
+        A BytesIO holding the UTF-8 encoded CSV (header row, no index column),
+        rewound to position 0.
+    """
+    csv_buffer = BytesIO()
+    # Select desired columns and write to buffer
+    df_export = df[['text', 'label', 'category', 'score', 'start', 'end']]
+    csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
+    # Rewind so a consumer calling read() starts at the beginning.
+    csv_buffer.seek(0)
+    return csv_buffer
+# -----------------------------------
+# --- Existing App Functionality (HTML) ---
+# NOTE: Removed the 'grouped_entity_table_html' generation that counted by label,
+# keeping only the grouped by category table generation if needed for the HTML report,
+# but prioritizing the Streamlit display of the grouped-by-category table.
+def generate_html_report(df, text_input, elapsed_time, df_topic_data):
+    """
+    Generates a full HTML report containing all analysis results and
+    visualizations. (Simplified HTML generation for brevity in code)
+    """
+    # ... (Plotly chart HTML generation code remains largely the same)
+    # 1. Generate Visualizations (Plotly HTML)
+    # 1a. Treemap
+    fig_treemap = px.treemap(
+        df,
+        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
+        values='score',
+        color='category',
+        title="Entity Distribution by Category and Label",
+        color_discrete_sequence=px.colors.qualitative.Dark24
+    )
+    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
+    # 1b. Pie Chart
+    grouped_counts = df['category'].value_counts().reset_index()
+    grouped_counts.columns = ['Category', 'Count']
+    fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
+    fig_pie.update_layout(margin=dict(t=50, b=10))
+    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
+    # 1c. Bar Chart (Category Count)
+    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
+    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
+    bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
+    # 1d. Bar Chart (Most Frequent Entities)
+    word_counts = df['text'].value_counts().reset_index()
+    word_counts.columns = ['Entity', 'Count']
+    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
+    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
+    if not repeating_entities.empty:
+        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
+        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
+        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
+    # 1e. Network Graph HTML
+    network_fig = generate_network_graph(df, text_input)
+    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
+    # 1f. Topic Charts HTML
+    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
+    if df_topic_data is not None and not df_topic_data.empty:
+        bubble_figure = create_topic_word_bubbles(df_topic_data)
+        if bubble_figure:
+            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
+        else:
+            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
+    else:
+        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #cccccc;">'
+        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
+        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
+        topic_charts_html += '</div>'
+    # 2. Get Highlighted Text
+    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
+    # 3. Entity Tables (Pandas to HTML)
+    # The grouped by category table is used here for the HTML export
+    grouped_entity_table_df = df.groupby(['category', 'label']).size().reset_index(name='Count')
+    grouped_entity_table_df.columns = ['Category', 'Entity', 'Count'] # Column Renaming
+    grouped_entity_table_html = grouped_entity_table_df.to_html(
+         classes='table table-striped',
+         index=False
+    )
+    # 4. Construct the Final HTML
+    html_content = f"""<!DOCTYPE html><html lang="en"><head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Entity and Topic Analysis Report</title>
+    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+    <style>
+        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
+        .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
+        h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
+        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
+        h3 {{ color: #555; margin-top: 20px; }}
+        .metadata {{ background-color: #e6f7ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
+        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
+        table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
+        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
+        table th {{ background-color: #f0f0f0; }}
+        .highlighted-text {{ border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
+    </style></head><body>
+    <div class="container">
+        <h1>Entity and Topic Analysis Report</h1>
+        <div class="metadata">
+            <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
+            <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
+        </div>
+        <h2>1. Analyzed Text & Extracted Entities</h2>
+        <h3>Original Text with Highlighted Entities</h3>
+        <div class="highlighted-text-container">
+            {highlighted_text_html}
+        </div>
+        <h2>2. Entities Count by Category and Entity</h2>
+        {grouped_entity_table_html}
+        <h2>3. Data Visualizations</h2>
+        <h3>3.1 Entity Distribution Treemap</h3>
+        <div class="chart-box">{treemap_html}</div>
+        <h3>3.2 Comparative Charts</h3>
+        <div class="chart-box">{pie_html}</div>
+        <div class="chart-box">{bar_category_html}</div>
+        <div class="chart-box">{bar_freq_html}</div>
+        <h3>3.3 Entity Relationship Map</h3>
+        <div class="chart-box">{network_html}</div>
+        <h2>4. Topic Modeling</h2>
+        {topic_charts_html}
+    </div></body></html>
+    """
+    return html_content
+# --- Page Configuration and Styling (No Sidebar, Removed Pink) ---
+st.set_page_config(layout="wide", page_title="NER & Topic Report App")
+st.markdown(
+    """
+    <style>
+    /* Overall app container - NO SIDEBAR */
+    .main {
+        background-color: #f0f2f6; /* Light Grey/Default */
+        color: #333333; /* Dark grey text for contrast */
+    }
+    .stApp {
+        background-color: #f0f2f6;
+    }
+    /* Text Area background and text color (input fields) */
+    .stTextArea textarea {
+        background-color: #FFFFFF; /* White for input fields */
+        color: #000000; /* Black text for input */
+        border: 1px solid #CCCCCC; /* Neutral border */
+    }
+    /* Button styling */
+    .stButton > button {
+        background-color: #007bff; /* Blue for the button */
+        color: #FFFFFF; /* White text for contrast */
+        border: none;
+        padding: 10px 20px;
+        border-radius: 5px;
+    }
+    /* Expander header and content background */
+    .streamlit-expanderHeader, .streamlit-expanderContent {
+        background-color: #e6f7ff; /* Very Light Blue/Neutral */
+        color: #333333;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True)
+st.subheader("NER and Topic Analysis Report Generator", divider="blue")
+st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
+expander = st.expander("**Important notes**")
+expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
+**Dependencies:** Note that **image export** requires the Python libraries `plotly` and `kaleido`.
+**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
+expander = st.expander("**Important notes**")
+expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
+**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
+**How to Use:** Type or paste your text (max. 1000 words) into the text area below, press Ctrl + Enter, and then click the 'Results' button.
+**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
+st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
+# --- Comet ML Setup (Placeholder/Conditional) ---
+# Comet logging is enabled only when all three environment variables are
+# present and non-empty.
+COMET_API_KEY = os.environ.get("COMET_API_KEY")
+COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
+COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
+comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
+# --- Model Loading ---
+# st.cache_resource keeps the returned model object across script reruns.
+@st.cache_resource
+def load_ner_model():
+    """Load the GLiNER model, or report the failure and halt the script.
+
+    Returns:
+        The model returned by GLiNER.from_pretrained for
+        "knowledgator/gliner-multitask-large-v0.5".
+
+    On any exception the error is shown with st.error and st.stop() ends the
+    current script run, so no value is returned in that case.
+    """
+    try:
+        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
+    except Exception as e:
+        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
+        st.stop()
+model = load_ner_model()
+# --- LONG DEFAULT TEXT (211 Words) ---
+# Sample passage used to pre-fill the text area on first load. The adjacent
+# string literals are concatenated into one single-line string.
+DEFAULT_TEXT = (
+    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
+    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
+    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
+    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
+    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
+    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
+    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
+    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
+    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
+    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
+    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
+    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
+    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
+)
+# -----------------------------------
+# --- Session State Initialization (CRITICAL FIX) ---
+# Every key read later in the script is given a default on the first run, so
+# attribute access on st.session_state never fails on a fresh session.
+if 'show_results' not in st.session_state:
+    st.session_state.show_results = False
+if 'last_text' not in st.session_state:
+    st.session_state.last_text = ""
+if 'results_df' not in st.session_state:
+    st.session_state.results_df = pd.DataFrame()
+if 'elapsed_time' not in st.session_state:
+    st.session_state.elapsed_time = 0.0
+if 'topic_results' not in st.session_state:
+    st.session_state.topic_results = None
+# 'my_text_area' is also the key of the text-area widget created below.
+if 'my_text_area' not in st.session_state:
+    st.session_state.my_text_area = DEFAULT_TEXT
+# --- Clear Button Function (MODIFIED) ---
+def clear_text() -> None:
+    """Empty the text area and reset all stored analysis results.
+
+    Used as the on_click callback of the "Clear text" button: it sets the
+    text-area value to "" and restores every result-related session key to
+    its initial default, which also hides the results section.
+    """
+    st.session_state['my_text_area'] = ""
+    st.session_state.show_results = False
+    st.session_state.last_text = ""
+    st.session_state.results_df = pd.DataFrame()
+    st.session_state.elapsed_time = 0.0
+    st.session_state.topic_results = None
+# --- Text Input and Clear Button ---
+word_limit = 1000
+text = st.text_area(
+    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
+    height=250,
+    key='my_text_area',
+    value=st.session_state.my_text_area)
+word_count = len(text.split())
+st.markdown(f"**Word count:** {word_count}/{word_limit}")
+st.button("Clear text", on_click=clear_text)
+# --- Results Trigger and Processing (Updated Logic) ---
+if st.button("Results"):
+    if not text.strip():
+        st.warning("Please enter some text to extract entities.")
+        st.session_state.show_results = False
+    elif word_count > word_limit:
+        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
+        st.session_state.show_results = False
+    else:
+        with st.spinner("Extracting entities and generating report data...", show_time=True):
+            if text != st.session_state.last_text:
+                st.session_state.last_text = text
+                start_time = time.time()
+                # --- Model Prediction & Dataframe Creation ---
+                entities = model.predict_entities(text, labels)
+                df = pd.DataFrame(entities)
+                if not df.empty:
+                    df['text'] = df['text'].apply(remove_trailing_punctuation)
+                    df['category'] = df['label'].map(reverse_category_mapping)
+                    st.session_state.results_df = df
+                    unique_entity_count = len(df['text'].unique())
+                    N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
+                    st.session_state.topic_results = perform_topic_modeling(
+                        df,
+                        num_topics=2,
+                        num_top_words=N_TOP_WORDS_TO_USE
+                    )
+                    if comet_initialized:
+                        experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
+                        experiment.log_parameter("input_text", text)
+                        experiment.log_table("predicted_entities", df)
+                        experiment.end()
+                else:
+                    st.session_state.results_df = pd.DataFrame()
+                    st.session_state.topic_results = None
+                end_time = time.time()
+                st.session_state.elapsed_time = end_time - start_time
+            st.session_state.show_results = True
+# --- Results Display ---
+if st.session_state.show_results and not st.session_state.results_df.empty:
+    st.success(f"Processing complete in {st.session_state.elapsed_time:.2f} seconds! 🎉")
+    df = st.session_state.results_df
+    text_input = st.session_state.last_text
+    elapsed_time = st.session_state.elapsed_time
+    df_topic_data = st.session_state.topic_results
+    # --- Highlighted Text and Download Buttons (Above Tabs) ---
+    st.subheader("1. Analyzed Text & Extracted Entities", divider="blue")
+    st.markdown(
+        highlight_entities(text_input, df),
+        unsafe_allow_html=True
+    )
+    st.subheader("Downloads", divider="blue")
+    col1, col2, col3 = st.columns([1, 1, 3])
+    # 1. Download CSV
+    csv_buffer = generate_entity_csv(df)
+    col1.download_button(
+        label="Download Entities as CSV",
+        data=csv_buffer.getvalue(),
+        file_name="ner_entities.csv",
+        mime="text/csv"
+    )
+    # 2. Download HTML Report
+    html_content = generate_html_report(df, text_input, elapsed_time, df_topic_data)
+    col2.download_button(
+        label="Download Full HTML Report",
+        data=html_content.encode('utf-8'),
+        file_name="ner_analysis_report.html",
+        mime="text/html"
+    )
+    st.markdown("---")
+    # --- Tabs Implementation ---
+    tab1, tab2 = st.tabs(["📊 Entity Data (Table)", "📈 Visualizations & Topics"])
+    with tab1:
+        # Create the summary table with the requested column name changes
+        grouped_entity_table = df.groupby(['category', 'label']).size().reset_index(name='Count')
+        grouped_entity_table.columns = ['Category', 'Entity', 'Count']
+        st.markdown("## Entity Counts by Category and Entity")
+        st.dataframe(grouped_entity_table.sort_values(by=['Category', 'Count'], ascending=[True, False]), use_container_width=True)
+               with st.expander("See Glossary of tags"):
+           st.write('''
+           - **start**: ['index of the start of the corresponding entity']
+           - **end**: ['index of the end of the corresponding entity']
+           - **text**: ['entity extracted from your text data']
+           - **label**: ['label (tag) assigned to a given extracted entity']
+           - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+           ''')
+    with tab2:
+        st.markdown("## Visualizations")
+        # 3a. Treemap (As requested in Tab 2)
+        fig_treemap = px.treemap(
+            df,
+            path=[px.Constant("All Entities"), 'category', 'label', 'text'],
+            values='score',
+            color='category',
+            title="Entity Distribution by Category and Label",
+            color_discrete_sequence=px.colors.qualitative.Dark24
+        )
+        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+        st.markdown("### Entity Distribution Treemap")
+        st.plotly_chart(fig_treemap, use_container_width=True)
+        st.markdown("---")
+        # 3b. Pie Chart and Category Bar Chart side-by-side
+        col_pie, col_bar_cat = st.columns(2)
+        # Pie Chart
+        grouped_counts = df['category'].value_counts().reset_index()
+        grouped_counts.columns = ['Category', 'Count']
+        fig_pie = px.pie(grouped_counts, values='Count', names='Category',
+                         title='Distribution of Entities by Category',
+                         color_discrete_sequence=px.colors.sequential.RdBu)
+        fig_pie.update_layout(margin=dict(t=50, b=10))
+        with col_pie:
+            st.markdown("### Distribution of Entities by Category")
+            st.plotly_chart(fig_pie, use_container_width=True)
+        # Category Bar Chart
+        fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
+                                 color='Category', title='Total Entities per Category',
+                                 color_discrete_sequence=px.colors.qualitative.Pastel)
+        fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10))
+        with col_bar_cat:
+            st.markdown("### Total Entities per Category")
+            st.plotly_chart(fig_bar_category, use_container_width=True)
+        st.markdown("---")
+        # 3c. Most Frequent Entities Bar Chart
+        word_counts = df['text'].value_counts().reset_index()
+        word_counts.columns = ['Entity', 'Count']
+        repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
+        st.markdown("### Top 10 Most Frequent Entities")
+        if not repeating_entities.empty:
+            fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
+                                 color='Entity', title='Top 10 Most Frequent Entities',
+                                 color_discrete_sequence=px.colors.sequential.Plasma)
+            fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
+            st.plotly_chart(fig_bar_freq, use_container_width=True)
+        else:
+            st.info("No entities appear more than once in the text for visualization.")
+        st.markdown("---")
+        # 3d. Network Graph
+        st.markdown("### Entity Relationship Map")
+        network_fig = generate_network_graph(df, text_input)
+        st.plotly_chart(network_fig, use_container_width=True)
+        st.markdown("---")
+        # 4. Topic Modeling
+        st.markdown("## Topic Modeling")
+        if df_topic_data is not None and not df_topic_data.empty:
+            st.markdown("### Bubble size = word weight")
+            bubble_figure = create_topic_word_bubbles(df_topic_data)
+            st.plotly_chart(bubble_figure, use_container_width=True)
+            st.markdown("### Top Words by Topic")
+            # Simple table display for topic data
+            st.dataframe(df_topic_data, use_container_width=True)
+        else:
+            st.info("Topic Modeling requires more unique input (at least two unique entities) to be performed.")
+elif st.session_state.show_results and st.session_state.results_df.empty:
+    st.warning("No entities were extracted from the provided text.")
+# Footer: shows a copy-and-paste <iframe> snippet for embedding the app.
+# NOTE(review): the text and URL refer to a "DataHarvest" space, while this
+# file was committed to the "relationship-map" space - confirm the embed URL
+# points at the intended app.
+st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
+code = '''
+<iframe
+src="https://aiecosystem-dataharvest.hf.space"
+frameborder="0"
+width="850"
+height="450"
+></iframe>
+'''
+st.code(code, language="html")