Spaces:

AIEcosystem
/

render4

Runtime error

App Files Files Community

AIEcosystem committed on Nov 6, 2025

Commit

ce1b83d

verified ·

1 Parent(s): 4f11778

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +184 -170

src/streamlit_app.py CHANGED Viewed

@@ -22,6 +22,7 @@ from sklearn.decomposition import LatentDirichletAllocation
 # ------------------------------
 from gliner import GLiNER
 from streamlit_extras.stylable_container import stylable_container
 # Using a try/except for comet_ml import
 try:
     from comet_ml import Experiment
@@ -31,8 +32,10 @@ except ImportError:
         def log_parameter(self, *args): pass
         def log_table(self, *args): pass
         def end(self): pass
 # --- Model Home Directory (Fix for deployment environments) ---
 os.environ['HF_HOME'] = '/tmp'
 # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
 FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
 FIXED_ENTITY_COLOR_MAP = {
@@ -51,18 +54,23 @@ FIXED_CATEGORY_MAPPING = {
   "People & Roles": ["person", "organization", "position"],
   "Locations": ["country", "city"],
   "Time & Dates": ["date", "time"],
-  "Numbers & Finance": ["money", "cardinal"]}
 REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
 # --- Dynamic Color Generator for Custom Labels ---
-# Use Plotly's Alphabet set for a large pool of distinct colors
 COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
 def extract_label(node_name):
     """Return the label held in the trailing parentheses of a node name.

     Node names are expected to look like 'Text (Label)'. When the string
     does not end with a non-empty parenthesised group, the placeholder
     "Unknown" is returned instead.
     """
     found = re.search(r'\(([^)]+)\)$', node_name)
     # Guard clause: no trailing "(...)" group means there is no label to report.
     if found is None:
         return "Unknown"
     return found.group(1)
 def remove_trailing_punctuation(text_string):
     """Return the string with all punctuation stripped from its right end.

     Every character found in ``string.punctuation`` is removed from the end
     of the input; leading and interior punctuation is left alone.
     """
     strippable = string.punctuation
     trimmed = text_string.rstrip(strippable)
     return trimmed
 def get_dynamic_color_map(active_labels, fixed_map):
     """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
     color_map = {}
@@ -78,6 +86,7 @@ def get_dynamic_color_map(active_labels, fixed_map):
             # Generate a new color from the palette
             color_map[label] = next(COLOR_PALETTE)
     return color_map
 def highlight_entities(text, df_entities, entity_color_map):
     """
     Generates HTML to display text with entities highlighted and colored.
@@ -103,6 +112,7 @@ def highlight_entities(text, df_entities, entity_color_map):
         highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
     # Use a div to mimic the Streamlit input box style for the report
     return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """Performs basic Topic Modeling using LDA."""
     documents = df_entities['text'].unique().tolist()
@@ -137,6 +147,7 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
         return pd.DataFrame(topic_data_list)
     except Exception as e:
         return None
 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across all topics."""
     df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
@@ -166,6 +177,7 @@ def create_topic_word_bubbles(df_topic_data):
         marker=dict(line=dict(width=1, color='DarkSlateGrey'))
     )
     return fig
 def generate_network_graph(df, raw_text, entity_color_map):
     """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
     entity_counts = df['text'].value_counts().reset_index()
@@ -202,6 +214,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
         if n1 in pos_map and n2 in pos_map:
             edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
             edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
     fig = go.Figure()
     edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
     fig.add_trace(edge_trace)
@@ -235,7 +248,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
         margin=dict(t=50, b=10, l=10, r=10), height=600
     )
     return fig
-# --- CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
     """Generates a CSV file of the extracted entities in an in-memory buffer."""
     csv_buffer = BytesIO()
@@ -243,6 +256,7 @@ def generate_entity_csv(df):
     csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
     csv_buffer.seek(0)
     return csv_buffer
 # -----------------------------------
 # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
@@ -252,6 +266,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
     """
     # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
     unique_categories = df['category'].unique()
     # 1. Generate Visualizations (Plotly HTML)
     # 1a. Treemap
     fig_treemap = px.treemap(
@@ -263,17 +278,21 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
         color_discrete_sequence=px.colors.qualitative.Dark24
     )
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
-    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn') # 1b. Pie Chart
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
     color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
     fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
     fig_pie.update_layout(margin=dict(t=50, b=10))
     pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
     # 1c. Bar Chart (Category Count)
     fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
     bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
     # 1d. Bar Chart (Most Frequent Entities)
     word_counts = df['text'].value_counts().reset_index()
     word_counts.columns = ['Entity', 'Count']
@@ -283,10 +302,11 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
         fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
         fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
         bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
     # 1e. Network Graph HTML - IMPORTANT: Pass color map
     network_fig = generate_network_graph(df, text_input, entity_color_map)
     network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
-   # 1f. Topic Charts HTML
     topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
     if df_topic_data is not None and not df_topic_data.empty:
         bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -299,13 +319,16 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
         topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
         topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
         topic_charts_html += '</div>'
     # 2. Get Highlighted Text - IMPORTANT: Pass color map
     highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
     # 3. Entity Tables (Pandas to HTML)
     entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
         classes='table table-striped',
         index=False
     )
     # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
     html_content = f"""<!DOCTYPE html><html lang="en"><head>
         <meta charset="UTF-8">
@@ -330,7 +353,8 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
         <div class="container">
             <h1>{report_title}</h1>
             <div class="metadata">
-                {branding_html} <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
                 <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
             </div>
             <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -338,8 +362,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
             <div class="highlighted-text-container">
                  {highlighted_text_html}
             </div>
-            <h2>2. Full Extracted Entities Table
-           </h2>
             {entity_table_html}
             <h2>3. Data Visualizations</h2>
             <h3>3.1 Entity Distribution Treemap</h3>
@@ -358,7 +381,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
     </html>
     """
     return html_content
-# --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
 def chunk_text(text, max_chunk_size=1500):
     """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
     # Split by double newline (paragraph) or sentence-like separators
@@ -378,6 +401,7 @@ def chunk_text(text, max_chunk_size=1500):
     if current_chunk:
         chunks.append((current_chunk, current_offset))
     return chunks
 def process_chunked_text(text, labels, model):
     """Processes large text in chunks and aggregates/offsets the entities."""
     # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
@@ -394,9 +418,9 @@ def process_chunked_text(text, labels, model):
             entity['end'] += chunk_offset
             all_entities.append(entity)
     return all_entities
-# -----------------------------------
-# --- Page Configuration and Styling (No Sidebar) ---
 st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 # --- Conditional Mobile Warning ---
 st.markdown(
     """
@@ -454,10 +478,8 @@ st.markdown(
     unsafe_allow_html=True)
 # --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
-# st.sidebar.header("Topic Modeling Settings 💡") # Removed sidebar header
-st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
-# Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
 tab1, tab2 = st.tabs(["Embed", "Important Notes"])
 with tab1:
     with st.expander("Embed"):
@@ -471,6 +493,7 @@ with tab1:
     ></iframe>
     '''
         st.code(code, language="html")
 with tab2:
     expander = st.expander("**Important Notes**")
     expander.markdown("""
@@ -480,6 +503,7 @@ with tab2:
     **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
     """)
     st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
 # --- Comet ML Setup (Placeholder/Conditional) ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -498,6 +522,7 @@ def load_ner_model(labels):
         print(f"FATAL ERROR: Failed to load NER model: {e}")
         st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
         st.stop()
 # --- LONG DEFAULT TEXT (178 Words) ---
 DEFAULT_TEXT = (
     "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -514,6 +539,7 @@ DEFAULT_TEXT = (
     "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
     "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
     "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
 # -----------------------------------
 # --- Session State Initialization (CRITICAL FIX) ---
 if 'show_results' not in st.session_state: st.session_state.show_results = False
@@ -530,8 +556,9 @@ if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slid
 if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
 if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
 if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
-# --- Clear Button Function (MODIFIED) ---
 def clear_text():
     """Clears the text area (sets it to an empty string) and hides results."""
     st.session_state['my_text_area'] = ""
@@ -586,81 +613,72 @@ if run_button:
         st.session_state.is_custom_mode = False
     active_labels = st.session_state.active_labels_list
-    if not text.strip():
-        st.warning("Please enter some text to extract entities.")
-        st.session_state.show_results = False
-    elif word_count > word_limit:
-        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
-        st.session_state.show_results = False
-    else:
-        # Define a safe threshold for when to start chunking (e.g., above 500 words)
         CHUNKING_THRESHOLD = 500
         should_chunk = word_count > CHUNKING_THRESHOLD
         mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
         if should_chunk:
             mode_msg += " with **chunking** for large text"
-        # --- Topic Modeling Input Retrieval (Using default or current state values) ---
-        # The actual sliders are only visible after results are shown, so here we use the state defaults
-        # or the last successfully run values to check for changes and run the model.
-        # Use the key that holds the current value, which is initialized at the top level
-        current_num_topics = st.session_state.num_topics_slider
-        current_num_top_words = st.session_state.num_top_words_slider
-        with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
-            # Re-run prediction only if text, active labels, OR topic parameters have changed
-            current_settings = (text, tuple(active_labels), current_num_topics, current_num_top_words)
-            # Add topic settings to last_settings check
-            last_settings = (
-                st.session_state.last_text,
-                tuple(st.session_state.get('last_active_labels', [])),
-                st.session_state.get('last_num_topics', None),
-                st.session_state.get('last_num_top_words', None)
-            )
-            if current_settings != last_settings:
-                start_time = time.time()
-                ner_model = load_ner_model(labels=active_labels)
-                # 2. Perform NER Extraction
-                if should_chunk:
-                    all_entities_list = process_chunked_text(text, active_labels, ner_model)
-                else:
-                    all_entities_list = ner_model.predict_entities(text, active_labels)
-                df = pd.DataFrame(all_entities_list)
-                if df.empty:
-                    df_topic_data = None
                 else:
-                    # 3. Add Category Mapping
-                    df['category'] = df['label'].apply(
-                        lambda l: REVERSE_FIXED_CATEGORY_MAPPING.get(l, "User Defined Entities")
-                    )
-                    # 4. Perform Topic Modeling (Passing the new parameters)
-                    df_topic_data = perform_topic_modeling(
-                        df_entities=df,
-                        num_topics=current_num_topics, # PARAMETER
-                        num_top_words=current_num_top_words # PARAMETER
-                    )
-                end_time = time.time()
-                elapsed_time = end_time - start_time
-                # 5. Save Results to Session State
-                st.session_state.results_df = df
-                st.session_state.topic_results = df_topic_data
-                st.session_state.elapsed_time = elapsed_time
-                st.session_state.last_text = text
-                st.session_state.show_results = True
-                st.session_state.last_active_labels = active_labels
-                st.session_state.last_num_topics = current_num_topics # Save topic settings
-                st.session_state.last_num_top_words = current_num_top_words # Save topic settings
             else:
-                st.info("Results already calculated for the current text and settings.")
-                st.session_state.show_results = True
 # --- Display Download Link and Results (Updated with White-Label inputs) ---
 if st.session_state.show_results:
@@ -677,9 +695,11 @@ if st.session_state.show_results:
         # 1. Highlighted Text
         st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
         st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
         # 2. Detailed Entity Analysis Tabs
         st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         # Determine which categories to use for the tabs
         if st.session_state.is_custom_mode:
             unique_categories = ["User Defined Entities"]
@@ -687,95 +707,77 @@ if st.session_state.show_results:
             st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
         else:
             unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
-        # --- Section 2a: Detailed Tables by Category/Label ---
-# --- Function to Apply Conditional Coloring to Scores ---
-def color_score_gradient(df):
-    """
-    Applies a color gradient to the 'score' column using Pandas Styler.
-    High scores (closer to 1.0) will be darker/more saturated.
-    """
-    # Use 'YlGnBu' (Yellow-Green-Blue) gradient.
-    # We apply the gradient only to the 'score' column subset.
-    return df.style.background_gradient(
-        cmap='YlGnBu',
-        subset=['score']
-    ).format(
-        {'score': '{:.4f}'} # Re-apply the four decimal place format
-    )
-# --- Your Main Tab Detail Logic ---
-# Note: This code assumes 'df', 'st.session_state.is_custom_mode', and 'unique_categories'
-# are already defined earlier in your Streamlit application.
-tab_category_details:
-    st.markdown("#### Detailed Entities Table (Grouped by Category)")
-    if st.session_state.is_custom_mode:
-        # In custom mode, group by the actual label since the category is just "User Defined Entities"
-        tabs_list = df['label'].unique().tolist()
-        tabs_category = st.tabs(tabs_list)
-        for label, tab in zip(tabs_list, tabs_category):
-            # Prepare the DataFrame for the current label
-            df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
-            # Apply the coloring function
-            styled_df_label = color_score_gradient(df_label)
-            with tab:
-                st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
-                st.dataframe(
-                    # Pass the STYLED DataFrame object to Streamlit
-                    styled_df_label,
-                    use_container_width=True,
-                    # NOTE: st.column_config for 'score' is removed because Pandas Styler handles formatting and coloring
-                )
-    else:
-        # In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
-        tabs_category = st.tabs(unique_categories)
-        for category, tab in zip(unique_categories, tabs_category):
-            # Prepare the DataFrame for the current category
-            df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
-            # Apply the coloring function
-            styled_df_category = color_score_gradient(df_category)
-            with tab:
-                st.markdown(f"##### {category} Entities ({len(df_category)} total)")
-                if not df_category.empty:
-                    st.dataframe(
-                        # Pass the STYLED DataFrame object to Streamlit
-                        styled_df_category,
-                        use_container_width=True,
-                        # NOTE: st.column_config for 'score' is removed
-                    )
-                else:
-                    st.info(f"No entities of category **{category}** were found in the text.")
             # --- INSERTED GLOSSARY HERE ---
             with st.expander("See Glossary of tags"):
-                st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
             # --- END GLOSSARY INSERTION ---
         # --- Section 2b: Treemap Visualization ---
         with tab_treemap_viz:
             st.markdown("#### Treemap: Entity Distribution")
@@ -788,6 +790,7 @@ tab_category_details:
             )
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
         # --- Section 3: Comparative Charts (COMPLETED) ---
         st.markdown("---")
         st.markdown("### 3. Comparative Charts")
@@ -796,15 +799,18 @@ tab_category_details:
         grouped_counts.columns = ['Category', 'Count']
         # Determine color sequence for charts
         chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
         with col1: # Pie Chart
             fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
             fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_pie, use_container_width=True)
         with col2: # Bar Chart by Category
             st.markdown("#### Entity Count by Category")
             fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
             fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
             st.plotly_chart(fig_bar_category, use_container_width=True)
         with col3: # Bar Chart for Most Frequent Entities
             st.markdown("#### Top 10 Most Frequent Entities")
             word_counts = df['text'].value_counts().reset_index()
@@ -832,7 +838,6 @@ tab_category_details:
             st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
             col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
             with col_slider_topic:
                 new_num_topics = st.slider(
                     "Number of Topics",
@@ -859,7 +864,6 @@ tab_category_details:
                 # Update session state with the new slider values
                 st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
                 st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
                 # Recalculate topic modeling results
                 if not st.session_state.results_df.empty:
                     df_topic_data_new = perform_topic_modeling(
@@ -884,7 +888,6 @@ tab_category_details:
             * Topics: **{st.session_state.last_num_topics}**
             * Top Words: **{st.session_state.last_num_top_words}**
             """)
             df_topic_data = st.session_state.topic_results # Get the potentially updated results
             if df_topic_data is not None and not df_topic_data.empty:
                 st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
@@ -892,7 +895,6 @@ tab_category_details:
             else:
                 st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
         # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
         st.markdown("---")
         st.markdown("### 5. White-Label Report Configuration 🎨")
@@ -909,10 +911,12 @@ tab_category_details:
             key='custom_branding_input',
             help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
         )
         # 6. Downloads (Updated to pass custom variables)
         st.markdown("---")
         st.markdown("### 6. Downloads")
         col_csv, col_html = st.columns(2)
         # CSV Download
         csv_buffer = generate_entity_csv(df)
         with col_csv:
@@ -923,9 +927,11 @@ tab_category_details:
                 mime="text/csv",
                 use_container_width=True
             )
         # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
         # We wrap the user's plain text in a styled HTML paragraph element
         branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
         # HTML Download (Passing custom white-label parameters)
         html_content = generate_html_report(
             df,
@@ -944,4 +950,12 @@ tab_category_details:
                 file_name="ner_topic_full_report.html",
                 mime="text/html",
                 use_container_width=True
-            )

 # ------------------------------
 from gliner import GLiNER
 from streamlit_extras.stylable_container import stylable_container
 # Using a try/except for comet_ml import
 try:
     from comet_ml import Experiment
         def log_parameter(self, *args): pass
         def log_table(self, *args): pass
         def end(self): pass
 # --- Model Home Directory (Fix for deployment environments) ---
 os.environ['HF_HOME'] = '/tmp'
 # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
 FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
 FIXED_ENTITY_COLOR_MAP = {
   "People & Roles": ["person", "organization", "position"],
   "Locations": ["country", "city"],
   "Time & Dates": ["date", "time"],
+  "Numbers & Finance": ["money", "cardinal"]
+}
 REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
 # --- Dynamic Color Generator for Custom Labels ---
+# Use Plotly's Alphabet set for a large pool of distinct colors
 COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
 def extract_label(node_name):
     """Return the label held in the final parentheses of a node name.
     Node names are expected to look like 'Text (Label)'. When the string does
     not end with a non-empty parenthesised group, "Unknown" is returned.
     """
     # The pattern is anchored with `$`, so only a group that closes the string counts.
     found = re.search(r'\(([^)]+)\)$', node_name)
     if not found:
         return "Unknown"
     return found.group(1)
 def remove_trailing_punctuation(text_string):
     """Strip every trailing ASCII punctuation character from *text_string*.
     Only the right-hand end is trimmed; leading and interior punctuation stay.
     """
     # string.punctuation supplies the set of characters that may be stripped.
     strippable = string.punctuation
     trimmed = text_string.rstrip(strippable)
     return trimmed
 def get_dynamic_color_map(active_labels, fixed_map):
     """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
     color_map = {}
             # Generate a new color from the palette
             color_map[label] = next(COLOR_PALETTE)
     return color_map
 def highlight_entities(text, df_entities, entity_color_map):
     """
     Generates HTML to display text with entities highlighted and colored.
         highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
     # Use a div to mimic the Streamlit input box style for the report
     return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """Performs basic Topic Modeling using LDA."""
     documents = df_entities['text'].unique().tolist()
         return pd.DataFrame(topic_data_list)
     except Exception as e:
         return None
 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across all topics."""
     df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
         marker=dict(line=dict(width=1, color='DarkSlateGrey'))
     )
     return fig
 def generate_network_graph(df, raw_text, entity_color_map):
     """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
     entity_counts = df['text'].value_counts().reset_index()
         if n1 in pos_map and n2 in pos_map:
             edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
             edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
     fig = go.Figure()
     edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
     fig.add_trace(edge_trace)
         margin=dict(t=50, b=10, l=10, r=10), height=600
     )
     return fig
 def generate_entity_csv(df):
     """Generates a CSV file of the extracted entities in an in-memory buffer."""
     csv_buffer = BytesIO()
     csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
     csv_buffer.seek(0)
     return csv_buffer
 # -----------------------------------
 # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
     """
     # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
     unique_categories = df['category'].unique()
     # 1. Generate Visualizations (Plotly HTML)
     # 1a. Treemap
     fig_treemap = px.treemap(
         color_discrete_sequence=px.colors.qualitative.Dark24
     )
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
+    # 1b. Pie Chart
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
     color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
     fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
     fig_pie.update_layout(margin=dict(t=50, b=10))
     pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
     # 1c. Bar Chart (Category Count)
     fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
     bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
     # 1d. Bar Chart (Most Frequent Entities)
     word_counts = df['text'].value_counts().reset_index()
     word_counts.columns = ['Entity', 'Count']
         fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
         fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
         bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
     # 1e. Network Graph HTML - IMPORTANT: Pass color map
     network_fig = generate_network_graph(df, text_input, entity_color_map)
     network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
     topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
     if df_topic_data is not None and not df_topic_data.empty:
         bubble_figure = create_topic_word_bubbles(df_topic_data)
         topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
         topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
         topic_charts_html += '</div>'
     # 2. Get Highlighted Text - IMPORTANT: Pass color map
     highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
     # 3. Entity Tables (Pandas to HTML)
     entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
         classes='table table-striped',
         index=False
     )
     # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
     html_content = f"""<!DOCTYPE html><html lang="en"><head>
         <meta charset="UTF-8">
         <div class="container">
             <h1>{report_title}</h1>
             <div class="metadata">
+                {branding_html}
+                <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
                 <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
             </div>
             <h2>1. Analyzed Text & Extracted Entities</h2>
             <div class="highlighted-text-container">
                  {highlighted_text_html}
             </div>
+            <h2>2. Full Extracted Entities Table           </h2>
             {entity_table_html}
             <h2>3. Data Visualizations</h2>
             <h3>3.1 Entity Distribution Treemap</h3>
     </html>
     """
     return html_content
 def chunk_text(text, max_chunk_size=1500):
     """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
     # Split by double newline (paragraph) or sentence-like separators
     if current_chunk:
         chunks.append((current_chunk, current_offset))
     return chunks
 def process_chunked_text(text, labels, model):
     """Processes large text in chunks and aggregates/offsets the entities."""
     # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
             entity['end'] += chunk_offset
             all_entities.append(entity)
     return all_entities
 st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 # --- Conditional Mobile Warning ---
 st.markdown(
     """
     unsafe_allow_html=True)
 # --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
+st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue")
 tab1, tab2 = st.tabs(["Embed", "Important Notes"])
 with tab1:
     with st.expander("Embed"):
     ></iframe>
     '''
         st.code(code, language="html")
 with tab2:
     expander = st.expander("**Important Notes**")
     expander.markdown("""
     **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
     """)
     st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
 # --- Comet ML Setup (Placeholder/Conditional) ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
         print(f"FATAL ERROR: Failed to load NER model: {e}")
         st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
         st.stop()
 # --- LONG DEFAULT TEXT (178 Words) ---
 DEFAULT_TEXT = (
     "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
     "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
     "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
     "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
 # -----------------------------------
 # --- Session State Initialization (CRITICAL FIX) ---
 if 'show_results' not in st.session_state: st.session_state.show_results = False
 if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
 if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
 if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
+if 'last_active_labels' not in st.session_state: st.session_state.last_active_labels = None # Added for results comparison
 def clear_text():
     """Clears the text area (sets it to an empty string) and hides results."""
     st.session_state['my_text_area'] = ""
         st.session_state.is_custom_mode = False
     active_labels = st.session_state.active_labels_list
+    # Get current topic modeling settings (used for caching logic)
+    current_num_topics = st.session_state.num_topics_slider
+    current_num_top_words = st.session_state.num_top_words_slider
+    # Caching Logic: Check if we need to re-run the full process
+    should_rerun_full_analysis = (
+        text.strip() != st.session_state.last_text.strip() or
+        active_labels != st.session_state.last_active_labels
+    )
+    if should_rerun_full_analysis and text.strip() and word_count <= word_limit:
+        # 2. Rerunning Full Analysis
         CHUNKING_THRESHOLD = 500
         should_chunk = word_count > CHUNKING_THRESHOLD
         mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
         if should_chunk:
             mode_msg += " with **chunking** for large text"
+        with st.spinner(f"Analyzing text with {mode_msg}..."):
+            start_time = time.time()
+            # 2a. Load Model (Model constraints are updated based on active labels)
+            # NOTE: Load time is cached, so this is fast on subsequent runs.
+            model = load_ner_model(active_labels)
+            # 2b. Extract Entities (using chunking if necessary)
+            if should_chunk:
+                all_entities = process_chunked_text(text, active_labels, model)
+            else:
+                all_entities = model.predict_entities(text, active_labels)
+            end_time = time.time()
+            elapsed_time = end_time - start_time
+            # 2c. Prepare DataFrame
+            df = pd.DataFrame(all_entities)
+            if not df.empty:
+                # Add category mapping
+                if st.session_state.is_custom_mode:
+                    df['category'] = 'User Defined Entities'
                 else:
+                    df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
+                # Clean up extracted text
+                df['text'] = df['text'].apply(remove_trailing_punctuation)
+                # 2d. Perform Topic Modeling on extracted entities
+                df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
             else:
+                df_topic_data = None
+            # 5. Save Results to Session State
+            st.session_state.results_df = df
+            st.session_state.topic_results = df_topic_data
+            st.session_state.elapsed_time = elapsed_time
+            st.session_state.last_text = text
+            st.session_state.show_results = True
+            st.session_state.last_active_labels = active_labels
+            st.session_state.last_num_topics = current_num_topics # Save topic settings
+            st.session_state.last_num_top_words = current_num_top_words # Save topic settings
+        else:
+            st.info("Results already calculated for the current text and settings.")
+            st.session_state.show_results = True
 # --- Display Download Link and Results (Updated with White-Label inputs) ---
 if st.session_state.show_results:
         # 1. Highlighted Text
         st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
         st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
         # 2. Detailed Entity Analysis Tabs
         st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         # Determine which categories to use for the tabs
         if st.session_state.is_custom_mode:
             unique_categories = ["User Defined Entities"]
             st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
         else:
             unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
+        # --- Section 2a: Detailed Tables by Category/Label ---
+        # --- Function to Apply Conditional Coloring to Scores ---
+        def color_score_gradient(df):
+            """
+            Applies a color gradient to the 'score' column using Pandas Styler.
+            High scores (closer to 1.0) will be darker/more saturated.
+            """
+            # Input: a DataFrame that contains a 'score' column (named in `subset` and
+            # in the format mapping below).
+            # Output: the Styler produced by the chained calls, not a DataFrame; the
+            # callers in the tab loops pass that object to st.dataframe.
+            # NOTE(review): the parameter name `df` shadows the results DataFrame `df`
+            # used by the surrounding code; inside this helper it is only the argument.
+            # NOTE(review): pandas documents background_gradient as requiring
+            # matplotlib — confirm it is installed in the deployment environment.
+            # Use the 'YlGnBu' (Yellow-Green-Blue) colormap for the gradient.
+            # The gradient is restricted to the 'score' column; other columns stay unstyled.
+            return df.style.background_gradient(
+                cmap='YlGnBu',
+                subset=['score']
+            ).format(
+                {'score': '{:.4f}'} # Render scores with four decimal places
+            )
+        # --- Your Main Tab Detail Logic ---
+        with tab_category_details:
+            st.markdown("#### Detailed Entities Table (Grouped by Category)")
+            if st.session_state.is_custom_mode:
+                # In custom mode, group by the actual label since the category is just "User Defined Entities"
+                tabs_list = df['label'].unique().tolist()
+                tabs_category = st.tabs(tabs_list)
+                for label, tab in zip(tabs_list, tabs_category):
+                    # Prepare the DataFrame for the current label
+                    df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
+                    # Apply the coloring function
+                    styled_df_label = color_score_gradient(df_label)
+                    with tab:
+                        st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
+                        st.dataframe(
+                            # Pass the STYLED DataFrame object to Streamlit
+                            styled_df_label,
+                            use_container_width=True,
+                            # NOTE: st.column_config for 'score' is removed because Pandas Styler handles formatting and coloring
+                        )
+            else:
+                # In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
+                tabs_category = st.tabs(unique_categories)
+                for category, tab in zip(unique_categories, tabs_category):
+                    # Prepare the DataFrame for the current category
+                    df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
+                    # Apply the coloring function
+                    styled_df_category = color_score_gradient(df_category)
+                    with tab:
+                        st.markdown(f"##### {category} Entities ({len(df_category)} total)")
+                        if not df_category.empty:
+                            st.dataframe(
+                                # Pass the STYLED DataFrame object to Streamlit
+                                styled_df_category,
+                                use_container_width=True,
+                                # NOTE: st.column_config for 'score' is removed
+                            )
+                        else:
+                            st.info(f"No entities of category **{category}** were found in the text.")
             # --- INSERTED GLOSSARY HERE ---
             with st.expander("See Glossary of tags"):
+                st.write('''- **text**: ['entity extracted from your text data']
+- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']
+- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']
+- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+- **start**: ['index of the start of the corresponding entity']
+- **end**: ['index of the end of the corresponding entity']''')
             # --- END GLOSSARY INSERTION ---
         # --- Section 2b: Treemap Visualization ---
         with tab_treemap_viz:
             st.markdown("#### Treemap: Entity Distribution")
             )
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
         # --- Section 3: Comparative Charts (COMPLETED) ---
         st.markdown("---")
         st.markdown("### 3. Comparative Charts")
         grouped_counts.columns = ['Category', 'Count']
         # Determine color sequence for charts
         chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
         with col1: # Pie Chart
             fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
             fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_pie, use_container_width=True)
         with col2: # Bar Chart by Category
             st.markdown("#### Entity Count by Category")
             fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
             fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
             st.plotly_chart(fig_bar_category, use_container_width=True)
         with col3: # Bar Chart for Most Frequent Entities
             st.markdown("#### Top 10 Most Frequent Entities")
             word_counts = df['text'].value_counts().reset_index()
             st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
             col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
             with col_slider_topic:
                 new_num_topics = st.slider(
                     "Number of Topics",
                 # Update session state with the new slider values
                 st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
                 st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
                 # Recalculate topic modeling results
                 if not st.session_state.results_df.empty:
                     df_topic_data_new = perform_topic_modeling(
             * Topics: **{st.session_state.last_num_topics}**
             * Top Words: **{st.session_state.last_num_top_words}**
             """)
             df_topic_data = st.session_state.topic_results # Get the potentially updated results
             if df_topic_data is not None and not df_topic_data.empty:
                 st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
             else:
                 st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
         # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
         st.markdown("---")
         st.markdown("### 5. White-Label Report Configuration 🎨")
             key='custom_branding_input',
             help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
         )
         # 6. Downloads (Updated to pass custom variables)
         st.markdown("---")
         st.markdown("### 6. Downloads")
         col_csv, col_html = st.columns(2)
         # CSV Download
         csv_buffer = generate_entity_csv(df)
         with col_csv:
                 mime="text/csv",
                 use_container_width=True
             )
         # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
         # We wrap the user's plain text in a styled HTML paragraph element
         branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
         # HTML Download (Passing custom white-label parameters)
         html_content = generate_html_report(
             df,
                 file_name="ner_topic_full_report.html",
                 mime="text/html",
                 use_container_width=True
+            )