AIEcosystem committed on
Commit
3dbd695
·
verified ·
1 Parent(s): b3682ae

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +82 -143
src/streamlit_app.py CHANGED
@@ -22,7 +22,6 @@ from sklearn.decomposition import LatentDirichletAllocation
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
25
-
26
  # Using a try/except for comet_ml import
27
  try:
28
  from comet_ml import Experiment
@@ -32,10 +31,8 @@ except ImportError:
32
  def log_parameter(self, *args): pass
33
  def log_table(self, *args): pass
34
  def end(self): pass
35
-
36
  # --- Model Home Directory (Fix for deployment environments) ---
37
  os.environ['HF_HOME'] = '/tmp'
38
-
39
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
40
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
41
  FIXED_ENTITY_COLOR_MAP = {
@@ -49,7 +46,6 @@ FIXED_ENTITY_COLOR_MAP = {
49
  "money": "#f43f5e", # Red
50
  "position": "#a855f7", # Violet
51
  }
52
-
53
  # --- Fixed Category Mapping ---
54
  FIXED_CATEGORY_MAPPING = {
55
  "People & Roles": ["person", "organization", "position"],
@@ -57,20 +53,16 @@ FIXED_CATEGORY_MAPPING = {
57
  "Time & Dates": ["date", "time"],
58
  "Numbers & Finance": ["money", "cardinal"]}
59
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
60
-
61
  # --- Dynamic Color Generator for Custom Labels ---
62
  # Use Plotly's Alphabet set for a large pool of distinct colors
63
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
64
-
65
def extract_label(node_name):
    """Return the parenthesized label from a node string such as 'Text (Label)'.

    Falls back to "Unknown" when the string has no trailing '(...)' group.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found:
        return found.group(1)
    return "Unknown"
69
-
70
def remove_trailing_punctuation(text_string):
    """Strip any run of trailing punctuation characters from *text_string*."""
    trimmed = text_string
    # Peel punctuation off the end one character at a time; equivalent to
    # rstrip(string.punctuation) but written as an explicit loop.
    while trimmed and trimmed[-1] in string.punctuation:
        trimmed = trimmed[:-1]
    return trimmed
73
-
74
  def get_dynamic_color_map(active_labels, fixed_map):
75
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
76
  color_map = {}
@@ -86,7 +78,6 @@ def get_dynamic_color_map(active_labels, fixed_map):
86
  # Generate a new color from the palette
87
  color_map[label] = next(COLOR_PALETTE)
88
  return color_map
89
-
90
  def highlight_entities(text, df_entities, entity_color_map):
91
  """
92
  Generates HTML to display text with entities highlighted and colored.
@@ -101,11 +92,9 @@ def highlight_entities(text, df_entities, entity_color_map):
101
  # Ensure the entity indices are within the bounds of the full text
102
  start = max(0, entity['start'])
103
  end = min(len(text), entity['end'])
104
-
105
  # Get entity text from the full document based on its indices
106
  # The 'text' column in the dataframe is now an attribute of the chunked text, not the original span
107
  entity_text_from_full_doc = text[start:end]
108
-
109
  label = entity['label']
110
  color = entity_color_map.get(label, '#000000')
111
  # Create a span with background color and tooltip
@@ -114,7 +103,6 @@ def highlight_entities(text, df_entities, entity_color_map):
114
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
115
  # Use a div to mimic the Streamlit input box style for the report
116
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
117
-
118
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
119
  """Performs basic Topic Modeling using LDA."""
120
  documents = df_entities['text'].unique().tolist()
@@ -122,29 +110,24 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
122
  # but here we use the extracted entity texts as per the original code's intent.
123
  if len(documents) < 2:
124
  return None
125
-
126
  N = min(num_top_words, len(documents))
127
  try:
128
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
129
  tfidf = tfidf_vectorizer.fit_transform(documents)
130
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
131
-
132
  if len(tfidf_feature_names) < num_topics:
133
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
134
  tfidf = tfidf_vectorizer.fit_transform(documents)
135
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
136
  if len(tfidf_feature_names) < num_topics:
137
  return None
138
-
139
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
140
  lda.fit(tfidf)
141
-
142
  topic_data_list = []
143
  for topic_idx, topic in enumerate(lda.components_):
144
  top_words_indices = topic.argsort()[:-N - 1:-1]
145
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
146
  word_weights = [topic[i] for i in top_words_indices]
147
-
148
  for word, weight in zip(top_words, word_weights):
149
  topic_data_list.append({
150
  'Topic_ID': f'Topic #{topic_idx + 1}',
@@ -152,17 +135,14 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
152
  'Weight': weight,
153
  })
154
  return pd.DataFrame(topic_data_list)
155
-
156
  except Exception as e:
157
  return None
158
-
159
  def create_topic_word_bubbles(df_topic_data):
160
  """Generates a Plotly Bubble Chart for top words across all topics."""
161
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
162
  df_topic_data['x_pos'] = df_topic_data.index
163
  if df_topic_data.empty:
164
  return None
165
-
166
  fig = px.scatter(
167
  df_topic_data,
168
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
@@ -186,7 +166,6 @@ def create_topic_word_bubbles(df_topic_data):
186
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
187
  )
188
  return fig
189
-
190
  def generate_network_graph(df, raw_text, entity_color_map):
191
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
192
  entity_counts = df['text'].value_counts().reset_index()
@@ -194,7 +173,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
194
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
195
  if unique_entities.shape[0] < 2:
196
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
197
-
198
  num_nodes = len(unique_entities)
199
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
200
  radius = 10
@@ -217,7 +195,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
217
  node2 = unique_entities_in_sentence[j]
218
  edge_tuple = tuple(sorted((node1, node2)))
219
  edges.add(edge_tuple)
220
-
221
  edge_x = []
222
  edge_y = []
223
  for edge in edges:
@@ -225,11 +202,9 @@ def generate_network_graph(df, raw_text, entity_color_map):
225
  if n1 in pos_map and n2 in pos_map:
226
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
227
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
228
-
229
  fig = go.Figure()
230
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
231
  fig.add_trace(edge_trace)
232
-
233
  fig.add_trace(go.Scatter(
234
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
235
  marker=dict(
@@ -241,7 +216,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
241
  customdata=unique_entities[['label', 'score', 'frequency']],
242
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
243
  ))
244
-
245
  legend_traces = []
246
  seen_labels = set()
247
  for index, row in unique_entities.iterrows():
@@ -250,10 +224,8 @@ def generate_network_graph(df, raw_text, entity_color_map):
250
  seen_labels.add(label)
251
  color = entity_color_map.get(label, '#cccccc')
252
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
253
-
254
  for trace in legend_traces:
255
  fig.add_trace(trace)
256
-
257
  fig.update_layout(
258
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
259
  showlegend=True, hovermode='closest',
@@ -263,7 +235,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
263
  margin=dict(t=50, b=10, l=10, r=10), height=600
264
  )
265
  return fig
266
-
267
  # --- CSV GENERATION FUNCTION ---
268
  def generate_entity_csv(df):
269
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
@@ -273,7 +244,6 @@ def generate_entity_csv(df):
273
  csv_buffer.seek(0)
274
  return csv_buffer
275
  # -----------------------------------
276
-
277
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
278
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
279
  """
@@ -282,7 +252,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
282
  """
283
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
284
  unique_categories = df['category'].unique()
285
-
286
  # 1. Generate Visualizations (Plotly HTML)
287
  # 1a. Treemap
288
  fig_treemap = px.treemap(
@@ -294,21 +263,17 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
294
  color_discrete_sequence=px.colors.qualitative.Dark24
295
  )
296
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
297
- treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
298
-
299
- # 1b. Pie Chart
300
  grouped_counts = df['category'].value_counts().reset_index()
301
  grouped_counts.columns = ['Category', 'Count']
302
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
303
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
304
  fig_pie.update_layout(margin=dict(t=50, b=10))
305
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
306
-
307
  # 1c. Bar Chart (Category Count)
308
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
309
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
310
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
311
-
312
  # 1d. Bar Chart (Most Frequent Entities)
313
  word_counts = df['text'].value_counts().reset_index()
314
  word_counts.columns = ['Entity', 'Count']
@@ -318,12 +283,10 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
318
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
319
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
320
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
321
-
322
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
323
  network_fig = generate_network_graph(df, text_input, entity_color_map)
324
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
325
-
326
- # 1f. Topic Charts HTML
327
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
328
  if df_topic_data is not None and not df_topic_data.empty:
329
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -336,16 +299,13 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
336
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
337
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
338
  topic_charts_html += '</div>'
339
-
340
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
341
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
342
-
343
  # 3. Entity Tables (Pandas to HTML)
344
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
345
  classes='table table-striped',
346
  index=False
347
  )
348
-
349
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
350
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
351
  <meta charset="UTF-8">
@@ -370,8 +330,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
370
  <div class="container">
371
  <h1>{report_title}</h1>
372
  <div class="metadata">
373
- {branding_html} <!-- CUSTOM BRANDING INSERTED HERE -->
374
- <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
375
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
376
  </div>
377
  <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -399,8 +358,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
399
  </html>
400
  """
401
  return html_content
402
- # -----------------------------------
403
-
404
  # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
405
  def chunk_text(text, max_chunk_size=1500):
406
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
@@ -409,10 +366,8 @@ def chunk_text(text, max_chunk_size=1500):
409
  chunks = []
410
  current_chunk = ""
411
  current_offset = 0
412
-
413
  for segment in segments:
414
  if not segment: continue
415
-
416
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
417
  # Save the current chunk and its starting offset
418
  chunks.append((current_chunk, current_offset))
@@ -422,34 +377,26 @@ def chunk_text(text, max_chunk_size=1500):
422
  current_chunk += segment
423
  if current_chunk:
424
  chunks.append((current_chunk, current_offset))
425
-
426
  return chunks
427
-
428
def process_chunked_text(text, labels, model):
    """Process large text in chunks and aggregate entities with corrected offsets.

    Splits *text* into character-bounded chunks via chunk_text(), runs the
    model on each chunk, then shifts every entity's 'start'/'end' indices by
    the chunk's starting offset so they index into the original document.

    Args:
        text: The full input document string.
        labels: Entity labels passed through to the model.
        model: A loaded model exposing predict_entities(chunk, labels) that
            returns dicts with 'start' and 'end' keys.

    Returns:
        A flat list of entity dicts with document-relative indices.
    """
    # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
    # The word count limit is 10000, but we chunk around 500 words for safety/performance.
    MAX_CHUNK_CHARS = 3500
    chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
    all_entities = []
    # FIX: the loop variable was previously named `chunk_text`, shadowing the
    # chunk_text() helper function within this scope; renamed to avoid the clash.
    for segment_text, segment_offset in chunks:
        # Predict entities on the small chunk
        chunk_entities = model.predict_entities(segment_text, labels)
        # Offset the start and end indices to match the original document
        for entity in chunk_entities:
            entity['start'] += segment_offset
            entity['end'] += segment_offset
            all_entities.append(entity)
    return all_entities
  # -----------------------------------
449
-
450
  # --- Page Configuration and Styling (No Sidebar) ---
451
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
452
-
453
  # --- Conditional Mobile Warning ---
454
  st.markdown(
455
  """
@@ -463,7 +410,6 @@ st.markdown(
463
  [data-testid="stAppViewBlock"] {
464
  background-color: #ffffff !important;
465
  }
466
-
467
  /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
468
  @media (max-width: 600px) {
469
  #mobile-warning-container {
@@ -506,10 +452,32 @@ st.markdown(
506
  </div>
507
  """,
508
  unsafe_allow_html=True)
509
- # ----------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
511
  # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
512
-
513
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
514
  with tab1:
515
  with st.expander("Embed"):
@@ -523,20 +491,15 @@ with tab1:
523
  ></iframe>
524
  '''
525
  st.code(code, language="html")
526
-
527
  with tab2:
528
  expander = st.expander("**Important Notes**")
529
  expander.markdown("""
530
  **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
531
-
532
  **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
533
-
534
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
535
-
536
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
537
  """)
538
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
539
-
540
  # --- Comet ML Setup (Placeholder/Conditional) ---
541
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
542
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -544,7 +507,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
544
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
545
 
546
  # --- Model Loading ---
547
- @st.cache_resource
548
  def load_ner_model(labels):
549
  """Loads the GLiNER model and caches it."""
550
  try:
@@ -552,10 +515,9 @@ def load_ner_model(labels):
552
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
553
  except Exception as e:
554
  # Log the actual error to the console for debugging
555
- print(f"FATAL ERROR: Failed to load NER model: {e}")
556
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
557
  st.stop()
558
-
559
  # --- LONG DEFAULT TEXT (178 Words) ---
560
  DEFAULT_TEXT = (
561
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -573,7 +535,6 @@ DEFAULT_TEXT = (
573
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
574
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
575
  # -----------------------------------
576
-
577
  # --- Session State Initialization (CRITICAL FIX) ---
578
  if 'show_results' not in st.session_state: st.session_state.show_results = False
579
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
@@ -620,7 +581,7 @@ with col_results:
620
  with col_clear:
621
  st.button("Clear text", on_click=clear_text, use_container_width=True)
622
 
623
- # --- Results Trigger and Processing (Updated Logic with Chunking) ---
624
  if run_button:
625
  # 1. Determine Active Labels and Mode
626
  custom_labels_raw = st.session_state.custom_labels_input
@@ -635,7 +596,6 @@ if run_button:
635
  else:
636
  st.session_state.active_labels_list = custom_labels_list
637
  st.session_state.is_custom_mode = True
638
-
639
  else:
640
  st.session_state.active_labels_list = FIXED_LABELS
641
  st.session_state.is_custom_mode = False
@@ -652,77 +612,73 @@ if run_button:
652
  # Define a safe threshold for when to start chunking (e.g., above 500 words)
653
  CHUNKING_THRESHOLD = 500
654
  should_chunk = word_count > CHUNKING_THRESHOLD
655
-
656
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
657
  if should_chunk:
658
  mode_msg += " with **chunking** for large text"
659
 
660
- with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
 
 
 
661
 
662
- # Re-run prediction only if text or active labels have changed
663
- current_settings = (text, tuple(active_labels))
664
- last_settings = (st.session_state.last_text, tuple(st.session_state.get('last_active_labels', [])))
 
 
 
 
 
 
 
665
 
666
  if current_settings != last_settings:
667
- st.session_state.last_text = text
668
- st.session_state['last_active_labels'] = active_labels
669
-
670
  start_time = time.time()
 
671
 
672
- # Load model using the determined active labels
673
- model = load_ner_model(active_labels)
674
-
675
- # --- Model Prediction & Dataframe Creation (Using Chunking if needed) ---
676
  if should_chunk:
677
- entities = process_chunked_text(text, active_labels, model)
678
- st.info(f"Text was split into {len(chunk_text(text))} chunks for processing.")
679
  else:
680
- # Original logic for small texts
681
- entities = model.predict_entities(text, active_labels)
682
 
683
- elapsed_time = time.time() - start_time
684
- st.session_state.elapsed_time = elapsed_time
685
 
686
- # --- DataFrame Construction ---
687
- df = pd.DataFrame(entities)
688
  if df.empty:
689
- st.session_state.results_df = df
690
- st.session_state.topic_results = None
691
- st.session_state.show_results = True
692
  else:
693
- # Clean up entity text (optional, but good practice)
694
- df['text'] = df['text'].apply(remove_trailing_punctuation)
695
-
696
- # Map entities to categories
697
- if st.session_state.is_custom_mode:
698
- # For custom labels, group everything under a single category
699
- df['category'] = "User Defined Entities"
700
- else:
701
- # For fixed labels, use the fixed mapping
702
- df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
703
-
704
- # Remove duplicates for topics/frequency analysis, keeping the highest score
705
- df_unique_entities = df.sort_values('score', ascending=False).drop_duplicates(subset=['text', 'label'])
706
-
707
- # --- Topic Modeling ---
708
- # We use the unique entities as input for the topic modeling
709
- df_topic_data = perform_topic_modeling(df_unique_entities, num_topics=min(3, len(df_unique_entities.text.unique())), num_top_words=10)
710
-
711
- # Update session state
712
- st.session_state.results_df = df
713
- st.session_state.topic_results = df_topic_data
714
- st.session_state.show_results = True
715
-
 
716
  else:
717
- # If settings haven't changed, just show the last results
718
  st.session_state.show_results = True
719
 
720
-
721
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
722
  if st.session_state.show_results:
723
  df = st.session_state.results_df
724
  df_topic_data = st.session_state.topic_results
725
-
726
  # Generate the color map based on the results DF labels
727
  current_labels_in_df = df['label'].unique().tolist()
728
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
@@ -731,15 +687,12 @@ if st.session_state.show_results:
731
  st.warning("No entities were found in the provided text with the current label set.")
732
  else:
733
  st.subheader("Analysis Results", divider="blue")
734
-
735
  # 1. Highlighted Text
736
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
737
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
738
-
739
  # 2. Detailed Entity Analysis Tabs
740
  st.markdown("### 2. Detailed Entity Analysis")
741
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
742
-
743
  # Determine which categories to use for the tabs
744
  if st.session_state.is_custom_mode:
745
  unique_categories = ["User Defined Entities"]
@@ -747,11 +700,9 @@ if st.session_state.show_results:
747
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
748
  else:
749
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
750
-
751
  # --- Section 2a: Detailed Tables by Category/Label ---
752
  with tab_category_details:
753
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
754
-
755
  if st.session_state.is_custom_mode:
756
  # In custom mode, group by the actual label since the category is just "User Defined Entities"
757
  tabs_list = df['label'].unique().tolist()
@@ -780,12 +731,10 @@ if st.session_state.show_results:
780
  )
781
  else:
782
  st.info(f"No entities of category **{category}** were found in the text.")
783
-
784
  # --- INSERTED GLOSSARY HERE ---
785
  with st.expander("See Glossary of tags"):
786
  st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
787
  # --- END GLOSSARY INSERTION ---
788
-
789
  # --- Section 2b: Treemap Visualization ---
790
  with tab_treemap_viz:
791
  st.markdown("#### Treemap: Entity Distribution")
@@ -798,28 +747,23 @@ if st.session_state.show_results:
798
  )
799
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
800
  st.plotly_chart(fig_treemap, use_container_width=True)
801
-
802
  # --- Section 3: Comparative Charts (COMPLETED) ---
803
  st.markdown("---")
804
  st.markdown("### 3. Comparative Charts")
805
  col1, col2, col3 = st.columns(3)
806
  grouped_counts = df['category'].value_counts().reset_index()
807
  grouped_counts.columns = ['Category', 'Count']
808
-
809
  # Determine color sequence for charts
810
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
811
-
812
  with col1: # Pie Chart
813
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
814
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
815
  st.plotly_chart(fig_pie, use_container_width=True)
816
-
817
  with col2: # Bar Chart by Category
818
  st.markdown("#### Entity Count by Category")
819
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
820
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
821
  st.plotly_chart(fig_bar_category, use_container_width=True)
822
-
823
  with col3: # Bar Chart for Most Frequent Entities
824
  st.markdown("#### Top 10 Most Frequent Entities")
825
  word_counts = df['text'].value_counts().reset_index()
@@ -831,35 +775,35 @@ if st.session_state.show_results:
831
  st.plotly_chart(fig_bar_freq, use_container_width=True)
832
  else:
833
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
834
-
835
  # 4. Network Graph and Topic Modeling
836
  st.markdown("---")
837
  st.markdown("### 4. Advanced Analysis")
838
  col_network, col_topic = st.columns(2)
839
-
840
  with col_network:
841
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
842
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
843
-
844
  with col_topic:
845
  with st.expander("💡 Topic Modeling (LDA)", expanded=True):
 
 
 
 
 
 
846
  if df_topic_data is not None and not df_topic_data.empty:
847
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
848
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
849
  else:
850
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
851
-
852
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
853
  st.markdown("---")
854
  st.markdown("### 5. White-Label Report Configuration 🎨")
855
-
856
  # Set a dynamic default title based on the mode
857
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
858
  custom_report_title = st.text_input(
859
  "Type Your Report Title (for HTML Report), and then press Enter.",
860
  value=default_report_title
861
  )
862
-
863
  # UPDATED: Simplified input for the user
864
  custom_branding_text_input = st.text_area(
865
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
@@ -867,13 +811,10 @@ if st.session_state.show_results:
867
  key='custom_branding_input',
868
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
869
  )
870
-
871
  # 6. Downloads (Updated to pass custom variables)
872
  st.markdown("---")
873
  st.markdown("### 6. Downloads")
874
-
875
  col_csv, col_html = st.columns(2)
876
-
877
  # CSV Download
878
  csv_buffer = generate_entity_csv(df)
879
  with col_csv:
@@ -884,11 +825,9 @@ if st.session_state.show_results:
884
  mime="text/csv",
885
  use_container_width=True
886
  )
887
-
888
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
889
  # We wrap the user's plain text in a styled HTML paragraph element
890
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
891
-
892
  # HTML Download (Passing custom white-label parameters)
893
  html_content = generate_html_report(
894
  df,
@@ -907,4 +846,4 @@ if st.session_state.show_results:
907
  file_name="ner_topic_full_report.html",
908
  mime="text/html",
909
  use_container_width=True
910
- )
 
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
 
25
  # Using a try/except for comet_ml import
26
  try:
27
  from comet_ml import Experiment
 
31
  def log_parameter(self, *args): pass
32
  def log_table(self, *args): pass
33
  def end(self): pass
 
34
  # --- Model Home Directory (Fix for deployment environments) ---
35
  os.environ['HF_HOME'] = '/tmp'
 
36
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
37
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
38
  FIXED_ENTITY_COLOR_MAP = {
 
46
  "money": "#f43f5e", # Red
47
  "position": "#a855f7", # Violet
48
  }
 
49
  # --- Fixed Category Mapping ---
50
  FIXED_CATEGORY_MAPPING = {
51
  "People & Roles": ["person", "organization", "position"],
 
53
  "Time & Dates": ["date", "time"],
54
  "Numbers & Finance": ["money", "cardinal"]}
55
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
 
56
  # --- Dynamic Color Generator for Custom Labels ---
57
  # Use Plotly's Alphabet set for a large pool of distinct colors
58
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
 
59
  def extract_label(node_name):
60
  """Extracts the label from a node string like 'Text (Label)'."""
61
  match = re.search(r'\(([^)]+)\)$', node_name)
62
  return match.group(1) if match else "Unknown"
 
63
  def remove_trailing_punctuation(text_string):
64
  """Removes trailing punctuation from a string."""
65
  return text_string.rstrip(string.punctuation)
 
66
  def get_dynamic_color_map(active_labels, fixed_map):
67
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
68
  color_map = {}
 
78
  # Generate a new color from the palette
79
  color_map[label] = next(COLOR_PALETTE)
80
  return color_map
 
81
  def highlight_entities(text, df_entities, entity_color_map):
82
  """
83
  Generates HTML to display text with entities highlighted and colored.
 
92
  # Ensure the entity indices are within the bounds of the full text
93
  start = max(0, entity['start'])
94
  end = min(len(text), entity['end'])
 
95
  # Get entity text from the full document based on its indices
96
  # The 'text' column in the dataframe is now an attribute of the chunked text, not the original span
97
  entity_text_from_full_doc = text[start:end]
 
98
  label = entity['label']
99
  color = entity_color_map.get(label, '#000000')
100
  # Create a span with background color and tooltip
 
103
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
104
  # Use a div to mimic the Streamlit input box style for the report
105
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 
106
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
107
  """Performs basic Topic Modeling using LDA."""
108
  documents = df_entities['text'].unique().tolist()
 
110
  # but here we use the extracted entity texts as per the original code's intent.
111
  if len(documents) < 2:
112
  return None
 
113
  N = min(num_top_words, len(documents))
114
  try:
115
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
116
  tfidf = tfidf_vectorizer.fit_transform(documents)
117
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
118
  if len(tfidf_feature_names) < num_topics:
119
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
120
  tfidf = tfidf_vectorizer.fit_transform(documents)
121
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
122
  if len(tfidf_feature_names) < num_topics:
123
  return None
 
124
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
125
  lda.fit(tfidf)
 
126
  topic_data_list = []
127
  for topic_idx, topic in enumerate(lda.components_):
128
  top_words_indices = topic.argsort()[:-N - 1:-1]
129
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
130
  word_weights = [topic[i] for i in top_words_indices]
 
131
  for word, weight in zip(top_words, word_weights):
132
  topic_data_list.append({
133
  'Topic_ID': f'Topic #{topic_idx + 1}',
 
135
  'Weight': weight,
136
  })
137
  return pd.DataFrame(topic_data_list)
 
138
  except Exception as e:
139
  return None
 
140
  def create_topic_word_bubbles(df_topic_data):
141
  """Generates a Plotly Bubble Chart for top words across all topics."""
142
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
143
  df_topic_data['x_pos'] = df_topic_data.index
144
  if df_topic_data.empty:
145
  return None
 
146
  fig = px.scatter(
147
  df_topic_data,
148
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
 
166
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
167
  )
168
  return fig
 
169
  def generate_network_graph(df, raw_text, entity_color_map):
170
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
171
  entity_counts = df['text'].value_counts().reset_index()
 
173
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
174
  if unique_entities.shape[0] < 2:
175
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
176
  num_nodes = len(unique_entities)
177
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
178
  radius = 10
 
195
  node2 = unique_entities_in_sentence[j]
196
  edge_tuple = tuple(sorted((node1, node2)))
197
  edges.add(edge_tuple)
 
198
  edge_x = []
199
  edge_y = []
200
  for edge in edges:
 
202
  if n1 in pos_map and n2 in pos_map:
203
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
204
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
205
  fig = go.Figure()
206
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
207
  fig.add_trace(edge_trace)
 
208
  fig.add_trace(go.Scatter(
209
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
210
  marker=dict(
 
216
  customdata=unique_entities[['label', 'score', 'frequency']],
217
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
218
  ))
 
219
  legend_traces = []
220
  seen_labels = set()
221
  for index, row in unique_entities.iterrows():
 
224
  seen_labels.add(label)
225
  color = entity_color_map.get(label, '#cccccc')
226
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
 
227
  for trace in legend_traces:
228
  fig.add_trace(trace)
 
229
  fig.update_layout(
230
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
231
  showlegend=True, hovermode='closest',
 
235
  margin=dict(t=50, b=10, l=10, r=10), height=600
236
  )
237
  return fig
 
238
  # --- CSV GENERATION FUNCTION ---
239
  def generate_entity_csv(df):
240
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
 
244
  csv_buffer.seek(0)
245
  return csv_buffer
246
  # -----------------------------------
 
247
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
248
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
249
  """
 
252
  """
253
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
254
  unique_categories = df['category'].unique()
 
255
  # 1. Generate Visualizations (Plotly HTML)
256
  # 1a. Treemap
257
  fig_treemap = px.treemap(
 
263
  color_discrete_sequence=px.colors.qualitative.Dark24
264
  )
265
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
266
+ treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
+ # 1b. Pie Chart
 
 
267
  grouped_counts = df['category'].value_counts().reset_index()
268
  grouped_counts.columns = ['Category', 'Count']
269
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
270
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
271
  fig_pie.update_layout(margin=dict(t=50, b=10))
272
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
273
  # 1c. Bar Chart (Category Count)
274
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
275
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
276
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
277
  # 1d. Bar Chart (Most Frequent Entities)
278
  word_counts = df['text'].value_counts().reset_index()
279
  word_counts.columns = ['Entity', 'Count']
 
283
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
284
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
285
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
286
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
287
  network_fig = generate_network_graph(df, text_input, entity_color_map)
288
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
289
+ # 1f. Topic Charts HTML
 
290
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
291
  if df_topic_data is not None and not df_topic_data.empty:
292
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
299
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
300
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
301
  topic_charts_html += '</div>'
 
302
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
303
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
 
304
  # 3. Entity Tables (Pandas to HTML)
305
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
306
  classes='table table-striped',
307
  index=False
308
  )
 
309
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
310
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
311
  <meta charset="UTF-8">
 
330
  <div class="container">
331
  <h1>{report_title}</h1>
332
  <div class="metadata">
333
+ {branding_html}
+ <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
 
334
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
335
  </div>
336
  <h2>1. Analyzed Text & Extracted Entities</h2>
 
358
  </html>
359
  """
360
  return html_content
 
 
361
  # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
362
  def chunk_text(text, max_chunk_size=1500):
363
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
 
366
  chunks = []
367
  current_chunk = ""
368
  current_offset = 0
 
369
  for segment in segments:
370
  if not segment: continue
 
371
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
372
  # Save the current chunk and its starting offset
373
  chunks.append((current_chunk, current_offset))
 
377
  current_chunk += segment
378
  if current_chunk:
379
  chunks.append((current_chunk, current_offset))
 
380
  return chunks
 
381
  def process_chunked_text(text, labels, model):
382
  """Processes large text in chunks and aggregates/offsets the entities."""
383
  # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
384
  # The word count limit is 10000, but we chunk around 500 words for safety/performance.
385
  MAX_CHUNK_CHARS = 3500
 
386
  chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
387
  all_entities = []
 
388
  for chunk_text, chunk_offset in chunks:
389
  # Predict entities on the small chunk
390
  chunk_entities = model.predict_entities(chunk_text, labels)
 
391
  # Offset the start and end indices to match the original document
392
  for entity in chunk_entities:
393
  entity['start'] += chunk_offset
394
  entity['end'] += chunk_offset
395
  all_entities.append(entity)
 
396
  return all_entities
397
  # -----------------------------------
 
398
  # --- Page Configuration and Styling (No Sidebar) ---
399
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 
400
  # --- Conditional Mobile Warning ---
401
  st.markdown(
402
  """
 
410
  [data-testid="stAppViewBlock"] {
411
  background-color: #ffffff !important;
412
  }
 
413
  /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
414
  @media (max-width: 600px) {
415
  #mobile-warning-container {
 
452
  </div>
453
  """,
454
  unsafe_allow_html=True)
455
+
456
+ # --- Sidebar Inputs for Topic Modeling (NEW) ---
457
+ st.sidebar.header("Topic Modeling Settings 💡")
458
+ num_topics_input = st.sidebar.slider(
459
+ "Number of Topics",
460
+ min_value=2,
461
+ max_value=10,
462
+ value=5,
463
+ step=1,
464
+ key='num_topics_slider',
465
+ help="The number of underlying topics (clusters) to discover in the entity data (LDA)."
466
+ )
467
+ num_top_words_input = st.sidebar.slider(
468
+ "Number of Top Words per Topic",
469
+ min_value=5,
470
+ max_value=20,
471
+ value=10,
472
+ step=1,
473
+ key='num_top_words_slider',
474
+ help="The number of most important words to display for each topic."
475
+ )
476
+ st.sidebar.markdown("---")
477
+ # -----------------------------------------------
478
+
479
  st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
480
  # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
 
481
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
482
  with tab1:
483
  with st.expander("Embed"):
 
491
  ></iframe>
492
  '''
493
  st.code(code, language="html")
 
494
  with tab2:
495
  expander = st.expander("**Important Notes**")
496
  expander.markdown("""
497
  **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
 
498
  **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
 
499
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
 
500
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
501
  """)
502
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
 
503
  # --- Comet ML Setup (Placeholder/Conditional) ---
504
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
505
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
507
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
508
 
509
  # --- Model Loading ---
510
+ @st.cache_resource
511
  def load_ner_model(labels):
512
  """Loads the GLiNER model and caches it."""
513
  try:
 
515
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
516
  except Exception as e:
517
  # Log the actual error to the console for debugging
518
+ print(f"FATAL ERROR: Failed to load NER model: {e}")
519
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
520
  st.stop()
 
521
  # --- LONG DEFAULT TEXT (178 Words) ---
522
  DEFAULT_TEXT = (
523
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
 
535
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
536
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
537
  # -----------------------------------
 
538
  # --- Session State Initialization (CRITICAL FIX) ---
539
  if 'show_results' not in st.session_state: st.session_state.show_results = False
540
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
 
581
  with col_clear:
582
  st.button("Clear text", on_click=clear_text, use_container_width=True)
583
 
584
+ # --- Results Trigger and Processing (Completed Logic with Chunking and Topic Vars) ---
585
  if run_button:
586
  # 1. Determine Active Labels and Mode
587
  custom_labels_raw = st.session_state.custom_labels_input
 
596
  else:
597
  st.session_state.active_labels_list = custom_labels_list
598
  st.session_state.is_custom_mode = True
 
599
  else:
600
  st.session_state.active_labels_list = FIXED_LABELS
601
  st.session_state.is_custom_mode = False
 
612
  # Define a safe threshold for when to start chunking (e.g., above 500 words)
613
  CHUNKING_THRESHOLD = 500
614
  should_chunk = word_count > CHUNKING_THRESHOLD
 
615
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
616
  if should_chunk:
617
  mode_msg += " with **chunking** for large text"
618
 
619
+ # --- Topic Modeling Input Retrieval ---
620
+ # Get the current slider values
621
+ current_num_topics = st.session_state.num_topics_slider
622
+ current_num_top_words = st.session_state.num_top_words_slider
623
 
624
+ with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
625
+ # Re-run prediction only if text, active labels, OR topic parameters have changed
626
+ current_settings = (text, tuple(active_labels), current_num_topics, current_num_top_words)
627
+ # Add topic settings to last_settings check
628
+ last_settings = (
629
+ st.session_state.last_text,
630
+ tuple(st.session_state.get('last_active_labels', [])),
631
+ st.session_state.get('last_num_topics', None),
632
+ st.session_state.get('last_num_top_words', None)
633
+ )
634
 
635
  if current_settings != last_settings:
 
 
 
636
  start_time = time.time()
637
+ ner_model = load_ner_model(labels=active_labels)
638
 
639
+ # 2. Perform NER Extraction
 
 
 
640
  if should_chunk:
641
+ all_entities_list = process_chunked_text(text, active_labels, ner_model)
 
642
  else:
643
+ all_entities_list = ner_model.predict_entities(text, active_labels)
 
644
 
645
+ df = pd.DataFrame(all_entities_list)
 
646
 
 
 
647
  if df.empty:
648
+ df_topic_data = None
 
 
649
  else:
650
+ # 3. Add Category Mapping
651
+ df['category'] = df['label'].apply(
652
+ lambda l: REVERSE_FIXED_CATEGORY_MAPPING.get(l, "User Defined Entities")
653
+ )
654
+
655
+ # 4. Perform Topic Modeling (Passing the new parameters)
656
+ df_topic_data = perform_topic_modeling(
657
+ df_entities=df,
658
+ num_topics=current_num_topics, # NEW PARAMETER
659
+ num_top_words=current_num_top_words # NEW PARAMETER
660
+ )
661
+
662
+ end_time = time.time()
663
+ elapsed_time = end_time - start_time
664
+
665
+ # 5. Save Results to Session State
666
+ st.session_state.results_df = df
667
+ st.session_state.topic_results = df_topic_data
668
+ st.session_state.elapsed_time = elapsed_time
669
+ st.session_state.last_text = text
670
+ st.session_state.show_results = True
671
+ st.session_state.last_active_labels = active_labels
672
+ st.session_state.last_num_topics = current_num_topics # Save topic settings
673
+ st.session_state.last_num_top_words = current_num_top_words # Save topic settings
674
  else:
675
+ st.info("Results already calculated for the current text and settings.")
676
  st.session_state.show_results = True
677
 
 
678
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
679
  if st.session_state.show_results:
680
  df = st.session_state.results_df
681
  df_topic_data = st.session_state.topic_results
 
682
  # Generate the color map based on the results DF labels
683
  current_labels_in_df = df['label'].unique().tolist()
684
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
 
687
  st.warning("No entities were found in the provided text with the current label set.")
688
  else:
689
  st.subheader("Analysis Results", divider="blue")
 
690
  # 1. Highlighted Text
691
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
692
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
 
693
  # 2. Detailed Entity Analysis Tabs
694
  st.markdown("### 2. Detailed Entity Analysis")
695
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
696
  # Determine which categories to use for the tabs
697
  if st.session_state.is_custom_mode:
698
  unique_categories = ["User Defined Entities"]
 
700
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
701
  else:
702
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
 
703
  # --- Section 2a: Detailed Tables by Category/Label ---
704
  with tab_category_details:
705
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
 
706
  if st.session_state.is_custom_mode:
707
  # In custom mode, group by the actual label since the category is just "User Defined Entities"
708
  tabs_list = df['label'].unique().tolist()
 
731
  )
732
  else:
733
  st.info(f"No entities of category **{category}** were found in the text.")
 
734
  # --- INSERTED GLOSSARY HERE ---
735
  with st.expander("See Glossary of tags"):
736
  st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
737
  # --- END GLOSSARY INSERTION ---
 
738
  # --- Section 2b: Treemap Visualization ---
739
  with tab_treemap_viz:
740
  st.markdown("#### Treemap: Entity Distribution")
 
747
  )
748
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
749
  st.plotly_chart(fig_treemap, use_container_width=True)
 
750
  # --- Section 3: Comparative Charts (COMPLETED) ---
751
  st.markdown("---")
752
  st.markdown("### 3. Comparative Charts")
753
  col1, col2, col3 = st.columns(3)
754
  grouped_counts = df['category'].value_counts().reset_index()
755
  grouped_counts.columns = ['Category', 'Count']
 
756
  # Determine color sequence for charts
757
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
 
758
  with col1: # Pie Chart
759
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
760
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
761
  st.plotly_chart(fig_pie, use_container_width=True)
 
762
  with col2: # Bar Chart by Category
763
  st.markdown("#### Entity Count by Category")
764
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
765
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
766
  st.plotly_chart(fig_bar_category, use_container_width=True)
 
767
  with col3: # Bar Chart for Most Frequent Entities
768
  st.markdown("#### Top 10 Most Frequent Entities")
769
  word_counts = df['text'].value_counts().reset_index()
 
775
  st.plotly_chart(fig_bar_freq, use_container_width=True)
776
  else:
777
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
 
778
  # 4. Network Graph and Topic Modeling
779
  st.markdown("---")
780
  st.markdown("### 4. Advanced Analysis")
781
  col_network, col_topic = st.columns(2)
 
782
  with col_network:
783
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
784
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
 
785
  with col_topic:
786
  with st.expander("💡 Topic Modeling (LDA)", expanded=True):
787
+ # Display the current settings used for the topic modeling result
788
+ st.markdown(f"""
789
+ **LDA Parameters:**
790
+ * Topics: **{st.session_state.last_num_topics}**
791
+ * Top Words: **{st.session_state.last_num_top_words}**
792
+ """)
793
  if df_topic_data is not None and not df_topic_data.empty:
794
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
795
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
796
  else:
797
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
 
798
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
799
  st.markdown("---")
800
  st.markdown("### 5. White-Label Report Configuration 🎨")
 
801
  # Set a dynamic default title based on the mode
802
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
803
  custom_report_title = st.text_input(
804
  "Type Your Report Title (for HTML Report), and then press Enter.",
805
  value=default_report_title
806
  )
 
807
  # UPDATED: Simplified input for the user
808
  custom_branding_text_input = st.text_area(
809
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
 
811
  key='custom_branding_input',
812
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
813
  )
 
814
  # 6. Downloads (Updated to pass custom variables)
815
  st.markdown("---")
816
  st.markdown("### 6. Downloads")
 
817
  col_csv, col_html = st.columns(2)
 
818
  # CSV Download
819
  csv_buffer = generate_entity_csv(df)
820
  with col_csv:
 
825
  mime="text/csv",
826
  use_container_width=True
827
  )
 
828
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
829
  # We wrap the user's plain text in a styled HTML paragraph element
830
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
 
831
  # HTML Download (Passing custom white-label parameters)
832
  html_content = generate_html_report(
833
  df,
 
846
  file_name="ner_topic_full_report.html",
847
  mime="text/html",
848
  use_container_width=True
849
+ )