Spaces: Running

Update src/streamlit_app.py
Browse files

src/streamlit_app.py (+197, -177) CHANGED
@@ -1,4 +1,5 @@
 import os
+os.environ['HF_HOME'] = '/tmp'
 import time
 import streamlit as st
 import streamlit.components.v1 as components
@@ -10,23 +11,28 @@ import numpy as np
 import re
 import string
 import json
-
-# --- PPTX Imports (Kept for completeness) ---
+# --- PPTX Imports ---
 from io import BytesIO
 from pptx import Presentation
 from pptx.util import Inches, Pt
 from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
 import plotly.io as pio  # Required for image export
-#
-
+# ---------------------------
 # --- Stable Scikit-learn LDA Imports ---
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
-#
-
+# ------------------------------
 from gliner import GLiNER
 from streamlit_extras.stylable_container import stylable_container

+
+
+
+
+
+
+
+
 # Using a try/except for comet_ml import
 try:
     from comet_ml import Experiment
@@ -36,11 +42,9 @@ except ImportError:
         def log_parameter(self, *args): pass
         def log_table(self, *args): pass
         def end(self): pass
-
 # --- Model Home Directory (Fix for deployment environments) ---
 # Set HF_HOME environment variable to a writable path
 os.environ['HF_HOME'] = '/tmp'
-
 # --- Color Map for Highlighting and Network Graph Nodes ---
 entity_color_map = {
     "person": "#10b981",
@@ -52,28 +56,23 @@ entity_color_map = {
     "cardinal": "#06b6d4",
     "money": "#f43f5e",
     "position": "#a855f7",
-}
-
+}
 # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
 labels = list(entity_color_map.keys())
 category_mapping = {
     "People": ["person", "organization", "position"],
     "Locations": ["country", "city"],
     "Time": ["date", "time"],
-    "Numbers": ["money", "cardinal"]
-}
+    "Numbers": ["money", "cardinal"]}
 reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
-
 # --- Utility Functions for Analysis and Plotly ---
 def extract_label(node_name):
     """Extracts the label from a node string like 'Text (Label)'."""
     match = re.search(r'\(([^)]+)\)$', node_name)
     return match.group(1) if match else "Unknown"
-
 def remove_trailing_punctuation(text_string):
     """Removes trailing punctuation from a string."""
     return text_string.rstrip(string.punctuation)
-
 def highlight_entities(text, df_entities):
     """Generates HTML to display text with entities highlighted and colored."""
     if df_entities.empty:
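Note on the mapping kept by this hunk: reverse_category_mapping inverts category_mapping so each label resolves to its parent category in one dictionary lookup. A self-contained toy sketch of the inversion (a subset of the app's map, for illustration only):

    category_mapping = {
        "People": ["person", "organization", "position"],
        "Locations": ["country", "city"],
    }
    # One (label -> category) entry per label in every category's list.
    reverse_category_mapping = {label: category
                                for category, label_list in category_mapping.items()
                                for label in label_list}
    print(reverse_category_mapping["city"])          # Locations
    print(reverse_category_mapping["organization"])  # People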
@@ -94,31 +93,33 @@ def highlight_entities(text, df_entities):
     # Use a div to mimic the Streamlit input box style for the report
     return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'

+
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """
     Performs basic Topic Modeling using LDA on the extracted entities,
-    allowing for n-grams
+    allowing for n-grams to capture multi-word entities like 'Dr. Emily Carter'.
     """
-    # 1. Prepare Documents: Use unique entities
+    # 1. Prepare Documents: Use unique entities (they are short, clean documents)
     documents = df_entities['text'].unique().tolist()
-
+
     if len(documents) < 2:
         return None
-
+
     N = min(num_top_words, len(documents))

     try:
-        # 2. Vectorizer: Use TfidfVectorizer
+        # 2. Vectorizer: Use TfidfVectorizer, but allow unigrams, bigrams, and trigrams (ngram_range)
+        # to capture multi-word entities. We keep stop_words='english' for the *components* of the entity.
         tfidf_vectorizer = TfidfVectorizer(
             max_df=0.95,
             min_df=2,  # Only consider words/phrases that appear at least twice to find topics
             stop_words='english',
-            ngram_range=(1, 3)
+            ngram_range=(1, 3)  # This is the KEY to capturing "Dr. Emily Carter" as a single token (if it appears enough times)
         )

         tfidf = tfidf_vectorizer.fit_transform(documents)
         tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
-
+
         # Check if the vocabulary is too small after tokenization/ngram generation
         if len(tfidf_feature_names) < num_topics:
             # Re-run with min_df=1 if vocab is too small
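Note on the ngram_range change above: with ngram_range=(1, 3) the vectorizer emits unigrams, bigrams, and trigrams, so a multi-word entity can survive as a single feature once it clears min_df. A minimal runnable sketch with toy documents (not from the app):

    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["Dr. Emily Carter", "Emily Carter spoke",
            "European Space Agency", "the European Space Agency launch"]
    vec = TfidfVectorizer(min_df=2, stop_words='english', ngram_range=(1, 3))
    X = vec.fit_transform(docs)
    # The feature list now contains phrases such as 'emily carter' and
    # 'european space agency' alongside the single words.
    print(vec.get_feature_names_out())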
@@ -136,35 +137,43 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
             random_state=42, n_jobs=-1
         )
         lda.fit(tfidf)
-
+
         # 4. Extract Topic Data
         topic_data_list = []
         for topic_idx, topic in enumerate(lda.components_):
             top_words_indices = topic.argsort()[:-N - 1:-1]
+            # These top_words will now include phrases like 'emily carter' or 'european space agency'
             top_words = [tfidf_feature_names[i] for i in top_words_indices]
             word_weights = [topic[i] for i in top_words_indices]
-
+
             for word, weight in zip(top_words, word_weights):
                 topic_data_list.append({
                     'Topic_ID': f'Topic #{topic_idx + 1}',
                     'Word': word,
                     'Weight': weight,
                 })
-
+
         return pd.DataFrame(topic_data_list)
-
+
     except Exception as e:
+        # A broader catch for robustness
+        # st.error(f"Topic modeling failed: {e}")  # Keep commented out for cleaner app
         return None
+
+
+
+

 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across
     all topics, displaying the word directly on the bubble."""
     # Renaming columns to match the output of perform_topic_modeling
-    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic',
+    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic',
+                                                  'Word': 'word', 'Weight': 'weight'})
     df_topic_data['x_pos'] = df_topic_data.index  # Use index for x-position
     if df_topic_data.empty:
         return None
-
+
     fig = px.scatter(
         df_topic_data,
         x='x_pos',
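Note on the slice used above: topic.argsort()[:-N - 1:-1] takes the indices of the N largest weights in descending order (argsort is ascending, so the slice walks it backwards). A quick standalone check:

    import numpy as np

    topic = np.array([0.1, 0.7, 0.3, 0.5])  # toy LDA component weights
    N = 2
    top_idx = topic.argsort()[:-N - 1:-1]
    print(top_idx)         # [1 3] -> indices of the two largest weights
    print(topic[top_idx])  # [0.7 0.5], descending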
@@ -174,7 +183,7 @@ def create_topic_word_bubbles(df_topic_data):
         # Set text to the word
         text='word',
         hover_name='word',
-        size_max=40,
+        size_max=40,
         title='Topic Word Weights (Bubble Chart)',
         color_discrete_sequence=px.colors.qualitative.Bold,
         labels={
@@ -188,7 +197,7 @@ def create_topic_word_bubbles(df_topic_data):
     fig.update_layout(
         xaxis_title="Entity/Word",
         yaxis_title="Word Weight",
-        #
+        # Hide x-axis labels since words are now labels
         xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
         yaxis={'showgrid': True},
         showlegend=True,
@@ -197,56 +206,54 @@ def create_topic_word_bubbles(df_topic_data):
         height=600,
         margin=dict(t=50, b=100, l=50, r=10),
     )
-
-    # Update traces to set text
+
+    # Update traces to show the word text, set the text position, and set text color
     fig.update_traces(
+        # Position the text on top of the bubble
        textposition='middle center',
-
+        # --- THE KEY FIX IS HERE ---
+        # Set the text color to white for visibility against dark bubble colors
+        textfont=dict(color='white', size=10),
+        # ---------------------------
        hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
        marker=dict(line=dict(width=1, color='DarkSlateGrey'))
    )
-
+
     return fig

+
+
 def generate_network_graph(df, raw_text):
     """
     Generates a network graph visualization (Node Plot) with edges
-    based on entity co-occurrence in sentences.
+    based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
     """
+    # Using the existing generate_network_graph logic from previous context...
     entity_counts = df['text'].value_counts().reset_index()
     entity_counts.columns = ['text', 'frequency']
     unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
-
     if unique_entities.shape[0] < 2:
         return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
-
-    # Positioning logic (simplified circular layout with slight jitter)
     num_nodes = len(unique_entities)
     thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
     radius = 10
     unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
     unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
     pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
-
-    # Co-occurrence Edges based on sentences
     edges = set()
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
-
     for sentence in sentences:
         entities_in_sentence = []
         for entity_text in unique_entities['text'].unique():
             if entity_text.lower() in sentence.lower():
                 entities_in_sentence.append(entity_text)
         unique_entities_in_sentence = list(set(entities_in_sentence))
-
-        # Create edges for all pairs in the sentence
         for i in range(len(unique_entities_in_sentence)):
             for j in range(i + 1, len(unique_entities_in_sentence)):
                 node1 = unique_entities_in_sentence[i]
                 node2 = unique_entities_in_sentence[j]
                 edge_tuple = tuple(sorted((node1, node2)))
                 edges.add(edge_tuple)
-
     edge_x = []
     edge_y = []
     for edge in edges:
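Note on the edge-building loop above: every unordered pair of entities found in the same sentence becomes one edge, and the set deduplicates repeats across sentences. A compact sketch of the same idea on toy data (using a simplified sentence splitter and itertools.combinations in place of the index loops):

    import re
    from itertools import combinations

    raw_text = "Emily met ESA in Paris. ESA thanked Emily."
    entities = ["Emily", "ESA", "Paris"]
    edges = set()
    for sentence in re.split(r'(?<=[.?!])\s', raw_text):
        present = sorted({e for e in entities if e.lower() in sentence.lower()})
        for a, b in combinations(present, 2):
            edges.add((a, b))
    print(edges)  # {('ESA', 'Emily'), ('ESA', 'Paris'), ('Emily', 'Paris')}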
@@ -254,10 +261,7 @@ def generate_network_graph(df, raw_text):
         if n1 in pos_map and n2 in pos_map:
             edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
             edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
-
     fig = go.Figure()
-
-    # Edge Trace
     edge_trace = go.Scatter(
         x=edge_x, y=edge_y,
         line=dict(width=0.5, color='#888'),
@@ -267,8 +271,6 @@ def generate_network_graph(df, raw_text):
         showlegend=False
     )
     fig.add_trace(edge_trace)
-
-    # Node Trace
     fig.add_trace(go.Scatter(
         x=unique_entities['x'],
         y=unique_entities['y'],
@@ -278,7 +280,6 @@ def generate_network_graph(df, raw_text):
         textposition="top center",
         showlegend=False,
         marker=dict(
-            # Size nodes based on frequency
             size=unique_entities['frequency'] * 5 + 10,
             color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
             line_width=1,
@@ -294,8 +295,6 @@ def generate_network_graph(df, raw_text):
             "Frequency: %{customdata[2]}<extra></extra>"
         )
     ))
-
-    # Custom Legend for Node Colors
     legend_traces = []
     seen_labels = set()
     for index, row in unique_entities.iterrows():
@@ -308,7 +307,6 @@ def generate_network_graph(df, raw_text):
         ))
     for trace in legend_traces:
         fig.add_trace(trace)
-
     fig.update_layout(
         title='Entity Co-occurrence Network (Edges = Same Sentence)',
         showlegend=True,
@@ -321,8 +319,7 @@ def generate_network_graph(df, raw_text):
         height=600
     )
     return fig
-
-# --- CSV GENERATION FUNCTION ---
+# --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
     """
     Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -334,16 +331,14 @@ def generate_entity_csv(df):
     csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
     csv_buffer.seek(0)
     return csv_buffer
-#
-
-# --- HTML REPORT GENERATION FUNCTION ---
+# -----------------------------------
+# --- Existing App Functionality (HTML) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     """
     Generates a full HTML report containing all analysis results and visualizations.
-
+    (Content omitted for brevity but assumed to be here).
     """
     # 1. Generate Visualizations (Plotly HTML)
-
     # 1a. Treemap
     fig_treemap = px.treemap(
         df,
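Note on the in-memory CSV pattern referenced by this hunk: the DataFrame is encoded to bytes inside a BytesIO buffer, the buffer is rewound, and that buffer is what st.download_button later receives as data=. A standalone sketch:

    from io import BytesIO
    import pandas as pd

    df_export = pd.DataFrame({"text": ["ESA"], "label": ["organization"]})
    csv_buffer = BytesIO()
    csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
    csv_buffer.seek(0)  # rewind so the consumer reads from the start
    print(csv_buffer.read().decode('utf-8'))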
@@ -355,101 +350,69 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     )
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
-
     # 1b. Pie Chart
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
+    # Changed color_discrete_sequence from sequential.RdBu (which has reds) to sequential.Cividis
     fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=px.colors.sequential.Cividis)
     fig_pie.update_layout(margin=dict(t=50, b=10))
     pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
-
     # 1c. Bar Chart (Category Count)
     fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
     bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs='cdn')
-
     # 1d. Bar Chart (Most Frequent Entities)
     word_counts = df['text'].value_counts().reset_index()
     word_counts.columns = ['Entity', 'Count']
     repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
     bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
     if not repeating_entities.empty:
+        # Changed color_discrete_sequence from sequential.Plasma (which has pink/magenta) to sequential.Viridis
         fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Viridis)
         fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
         bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
-
     # 1e. Network Graph HTML
     network_fig = generate_network_graph(df, text_input)
     network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
-
     # 1f. Topic Charts HTML
     topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
     if df_topic_data is not None and not df_topic_data.empty:
         bubble_figure = create_topic_word_bubbles(df_topic_data)
         if bubble_figure:
-
+
             topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
         else:
             topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
     else:
-        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
+        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'  # Changed border color
         topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
         topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
         topic_charts_html += '</div>'
-
     # 2. Get Highlighted Text
     highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
-
     # 3. Entity Tables (Pandas to HTML)
     entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
         classes='table table-striped',
         index=False
     )
-
-    # 4. Construct the Final HTML with Corrected Mobile CSS
+    # 4. Construct the Final HTML
     html_content = f"""<!DOCTYPE html><html lang="en"><head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>Entity and Topic Analysis Report</title>
     <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
     <style>
-        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px;background-color: #f4f4f9; color: #333; }}
-        .container {{ max-width: 1200px; margin: 0 auto; background-color
-        h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom:10px; margin-top: 0; }}
-        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd;padding-bottom: 5px; }}
+        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
+        .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
+        h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
+        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
         h3 {{ color: #555; margin-top: 20px; }}
-        .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius:8px; margin-bottom: 20px; font-size: 0.9em; }}
-        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius:8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px;}}
+        .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
+        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
         table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
-        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align:left; }}
+        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
         table th {{ background-color: #f0f0f0; }}
-        .highlighted-text {{ border: 1px solid #888888; padding: 15px;border-radius: 5px; background-color: #ffffff; font-family: monospace;white-space: pre-wrap; margin-bottom: 20px; }}
-
-        /* === MOBILE-SPECIFIC FIXES FOR REPORT OVERLAP === */
-        @media (max-width: 600px) {
-            body {
-                padding: 10px;
-            }
-            .container {
-                padding: 10px;
-                border-radius: 0;
-            }
-            .chart-box {
-                padding: 5px;
-                overflow-x: auto; /* Allow horizontal scrolling for wide charts */
-            }
-            /* Ensures the Plotly chart inside has a minimum width */
-            .chart-box > div {
-                min-width: 400px;
-            }
-            /* Force tables to be scrollable */
-            table {
-                display: block;
-                overflow-x: auto;
-                white-space: nowrap;
-            }
-        }
-        /* ============================================== */
+        .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
     </style></head><body>
     <div class="container">
     <h1>Entity and Topic Analysis Report</h1>
@@ -478,10 +441,10 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     </div></body></html>
     """
     return html_content
-
 # --- Page Configuration and Styling (No Sidebar) ---
 st.set_page_config(layout="wide", page_title="NER & Topic Report App")

+
 # --- Conditional Mobile Warning ---
 st.markdown(
     """
@@ -517,35 +480,50 @@ st.markdown(
 )
 # ----------------------------------

-
+
+
+
+
+
+
+
+
 st.markdown(
     """
     <style>
+    /* ... (Keep your existing styles for main, stApp, stTextArea, stButton) ... */
     /* --- FIX: Tab Label Colors for Visibility --- */
+    /* Target the container for the tab labels (the buttons) */
     [data-testid="stConfigurableTabs"] button {
-        color: #333333 !important;
-        background-color: #f0f0f0;
+        color: #333333 !important; /* Dark gray for inactive tabs */
+        background-color: #f0f0f0; /* Light gray background for inactive tabs */
         border: 1px solid #cccccc;
     }
     /* Target the ACTIVE tab label */
     [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
-        color: #FFFFFF !important;
-        background-color: #007bff;
-        border-bottom: 2px solid #007bff;
+        color: #FFFFFF !important; /* White text for active tab */
+        background-color: #007bff; /* Blue background for active tab */
+        border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
     }
-
+
+    /* Expander header color fix (since you overwrote it to white) */
     .streamlit-expanderHeader {
-        color: #007bff;
+        color: #007bff; /* Blue text for Expander header */
     }
     </style>
     """,
     unsafe_allow_html=True
 )

-
+
+st.subheader("Entity and Topic Analysis Report Generator", divider="blue")  # Changed divider from "rainbow" (often includes red/pink) to "blue"
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

-
+
+
+
+
+tab1, tab2 = st.tabs(["Embed", "Important Notes"])  # Assuming you have defined the tabs

 with tab1:
     with st.expander("Embed"):
@@ -558,25 +536,32 @@ with tab1:
             height="450"
         ></iframe>
         '''
-        st.code(code, language="html")
+        st.code(code, language="html")  # Keeps the copy icon, as intended for tab1
+
+

 with tab2:
     expander = st.expander("**Important Notes**")
+    # Use st.markdown() with a code block (```) to display the notes
+    # without the copy-to-clipboard icon, and retaining the styling.
     expander.markdown("""
     **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
+
     **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
+
     **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
+
     **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
     """)

-
+
+st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")

 # --- Comet ML Setup (Placeholder/Conditional) ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
-
 # --- Model Loading ---
 @st.cache_resource
 def load_ner_model():
@@ -586,10 +571,9 @@ def load_ner_model():
     except Exception as e:
         st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
         st.stop()
-
 model = load_ner_model()
-
 # --- LONG DEFAULT TEXT (178 Words) ---
+
 DEFAULT_TEXT = (
     "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
     "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
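Note on the @st.cache_resource decorator above: it runs the loader once per server process and shares the returned model across reruns and sessions, so the GLiNER weights are not re-downloaded on every interaction. A sketch of the pattern; the checkpoint name is an assumption for illustration, since the diff does not show which one the app loads:

    import streamlit as st
    from gliner import GLiNER

    @st.cache_resource
    def load_ner_model():
        # Hypothetical checkpoint, for illustration only.
        return GLiNER.from_pretrained("urchade/gliner_base")

    model = load_ner_model()  # cached after the first call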
@@ -606,9 +590,16 @@ DEFAULT_TEXT = (
     "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
     "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
 )
-# -----------------------------------

-
+
+
+
+
+
+
+
+# -----------------------------------
+# --- Session State Initialization (CRITICAL FIX) ---
 if 'show_results' not in st.session_state:
     st.session_state.show_results = False
 if 'last_text' not in st.session_state:
@@ -619,11 +610,9 @@ if 'elapsed_time' not in st.session_state:
     st.session_state.elapsed_time = 0.0
 if 'topic_results' not in st.session_state:
     st.session_state.topic_results = None
-# --- FIX: Only set default text in session state, not in st.text_area value ---
 if 'my_text_area' not in st.session_state:
     st.session_state.my_text_area = DEFAULT_TEXT
-
-# --- Clear Button Function ---
+# --- Clear Button Function (MODIFIED) ---
 def clear_text():
     """Clears the text area (sets it to an empty string) and hides results."""
     st.session_state['my_text_area'] = ""
@@ -632,19 +621,16 @@ def clear_text():
     st.session_state.results_df = pd.DataFrame()
     st.session_state.elapsed_time = 0.0
     st.session_state.topic_results = None
-
 # --- Text Input and Clear Button ---
 word_limit = 1000
 text = st.text_area(
     f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
     height=250,
-    key='my_text_area',
+    key='my_text_area',
 )
-
 word_count = len(text.split())
 st.markdown(f"**Word count:** {word_count}/{word_limit}")
 st.button("Clear text", on_click=clear_text)
-
 # --- Results Trigger and Processing (Updated Logic) ---
 if st.button("Results"):
     if not text.strip():
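Note on the key='my_text_area' binding kept above: tying the widget to a session-state key is what makes the clear button work, because Streamlit only allows rewriting a widget's state before the widget is instantiated on the next rerun, which is exactly when on_click callbacks fire. A minimal sketch of the pattern:

    import streamlit as st

    def clear_text():
        st.session_state['my_text_area'] = ""  # safe inside a callback

    st.text_area("Input", key='my_text_area')
    st.button("Clear text", on_click=clear_text)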
@@ -658,25 +644,20 @@ if st.button("Results"):
         if text != st.session_state.last_text:
             st.session_state.last_text = text
             start_time = time.time()
-
             # --- Model Prediction & Dataframe Creation ---
             entities = model.predict_entities(text, labels)
             df = pd.DataFrame(entities)
-
             if not df.empty:
                 df['text'] = df['text'].apply(remove_trailing_punctuation)
                 df['category'] = df['label'].map(reverse_category_mapping)
                 st.session_state.results_df = df
-
                 unique_entity_count = len(df['text'].unique())
                 N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
-
                 st.session_state.topic_results = perform_topic_modeling(
                     df,
                     num_topics=2,
                     num_top_words=N_TOP_WORDS_TO_USE
                 )
-
                 if comet_initialized:
                     experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                     experiment.log_parameter("input_text", text)
@@ -685,37 +666,32 @@ if st.button("Results"):
             else:
                 st.session_state.results_df = pd.DataFrame()
                 st.session_state.topic_results = None
-
             end_time = time.time()
             st.session_state.elapsed_time = end_time - start_time
             st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
-
         st.session_state.show_results = True
-
-# --- Display Download Link and Results (Updated with Download Buttons) ---
+# --- Display Download Link and Results ---
 if st.session_state.show_results:
     df = st.session_state.results_df
     df_topic_data = st.session_state.topic_results
-
     if df.empty:
         st.warning("No entities were found in the provided text.")
     else:
         st.subheader("Analysis Results", divider="blue")
-
         # 1. Highlighted Text
         st.markdown("### 1. Analyzed Text with Highlighted Entities")
         st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
-
+
         # 2. Detailed Entity Analysis Tabs
         st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
-
         with tab_category_details:
             st.markdown("#### Detailed Entities Table (Grouped by Category)")
-
+
+
+
             unique_categories = list(category_mapping.keys())
             tabs_category = st.tabs(unique_categories)
-
             for category, tab in zip(unique_categories, tabs_category):
                 df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
                 with tab:
@@ -726,45 +702,89 @@ if st.session_state.show_results:
                         use_container_width=True,
                         column_config={'score': st.column_config.NumberColumn(format="%.4f")}
                     )
-
+                    else:
+                        st.info(f"No entities of category **{category}** were found in the text.")
+
+
+            with st.expander("See Glossary of tags"):
+                st.write('''
+                - **text**: ['entity extracted from your text data']
+                - **label**: ['label (tag) assigned to a given extracted entity']
+                - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+                - **start**: ['index of the start of the corresponding entity']
+                - **end**: ['index of the end of the corresponding entity']
+                ''')
+
         with tab_treemap_viz:
+            st.markdown("#### Treemap: Entity Distribution")
             fig_treemap = px.treemap(
                 df,
                 path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                 values='score',
                 color='category',
-                title="Entity Distribution by Category and Label",
                 color_discrete_sequence=px.colors.qualitative.Dark24
             )
-            fig_treemap.update_layout(margin=dict(t=
+            fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
+        # 3. Comparative Charts
+        st.markdown("---")
+        st.markdown("### 3. Comparative Charts")
+        col1, col2, col3 = st.columns(3)
+        grouped_counts = df['category'].value_counts().reset_index()
+        grouped_counts.columns = ['Category', 'Count']
+        with col1:  # Pie Chart
+            # Changed color_discrete_sequence
+            fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=px.colors.sequential.Cividis)
+            fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
+            st.plotly_chart(fig_pie, use_container_width=True)
+        with col2:  # Bar Chart (Category Count)
+            fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
+            fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=30, b=10, l=10, r=10), height=350)
+            st.plotly_chart(fig_bar_category, use_container_width=True)
+        with col3:  # Bar Chart (Most Frequent Entities)
+            word_counts = df['text'].value_counts().reset_index()
+            word_counts.columns = ['Entity', 'Count']
+            repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
+            if not repeating_entities.empty:
+                # Changed color_discrete_sequence
+                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Viridis)
+                fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=30, b=10, l=10, r=10), height=350)
+                st.plotly_chart(fig_bar_freq, use_container_width=True)
+            else:
+                st.info("No entities repeat for frequency chart.")
+        st.markdown("---")
+        st.markdown("### 4. Entity Relationship Map")
+        network_fig = generate_network_graph(df, st.session_state.last_text)
+        st.plotly_chart(network_fig, use_container_width=True)
+        st.markdown("---")
+        st.markdown("### 5. Topic Modelling Analysis")
+        if df_topic_data is not None and not df_topic_data.empty:
+            bubble_figure = create_topic_word_bubbles(df_topic_data)
+            if bubble_figure:
+                st.plotly_chart(bubble_figure, use_container_width=True)
+            else:
+                st.error("Error generating Topic Word Bubble Chart.")
+        else:
+            st.info("Topic modeling requires more unique input (at least two unique entities).")
+        # --- Report Download ---
+        st.markdown("---")
+        st.markdown("### Download Full Report Artifacts")
+        # 1. HTML Report Download (Retained)
+        html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
+        st.download_button(
+            label="Download Comprehensive HTML Report",
+            data=html_report,
+            file_name="ner_topic_report.html",
+            mime="text/html",
+            type="primary"
+        )

-
-
-        st.
-
-
-
-
-
-
-            data=csv_data,
-            file_name="entity_analysis_data.csv",
-            mime="text/csv",
-            type="primary"
-        )
-
-        with col_html:
-            html_report = generate_html_report(
-                df,
-                st.session_state.last_text,
-                st.session_state.elapsed_time,
-                df_topic_data
-            )
-            st.download_button(
-                label="Download Full HTML Report",
-                data=html_report,
-                file_name="entity_topic_report.html",
-                mime="text/html",
-                type="secondary"
-            )
+        # 2. CSV Data Download (NEW)
+        csv_buffer = generate_entity_csv(df)
+        st.download_button(
+            label="Download Extracted Entities (CSV)",
+            data=csv_buffer,
+            file_name="extracted_entities.csv",
+            mime="text/csv",
+            type="secondary"
+        )