AIEcosystem commited on
Commit
ce1b83d
·
verified ·
1 Parent(s): 4f11778

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +184 -170
src/streamlit_app.py CHANGED
@@ -22,6 +22,7 @@ from sklearn.decomposition import LatentDirichletAllocation
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
 
25
  # Using a try/except for comet_ml import
26
  try:
27
  from comet_ml import Experiment
@@ -31,8 +32,10 @@ except ImportError:
31
  def log_parameter(self, *args): pass
32
  def log_table(self, *args): pass
33
  def end(self): pass
 
34
  # --- Model Home Directory (Fix for deployment environments) ---
35
  os.environ['HF_HOME'] = '/tmp'
 
36
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
37
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
38
  FIXED_ENTITY_COLOR_MAP = {
@@ -51,18 +54,23 @@ FIXED_CATEGORY_MAPPING = {
51
  "People & Roles": ["person", "organization", "position"],
52
  "Locations": ["country", "city"],
53
  "Time & Dates": ["date", "time"],
54
- "Numbers & Finance": ["money", "cardinal"]}
 
55
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
 
56
  # --- Dynamic Color Generator for Custom Labels ---
57
- # Use Plotly's Alphabet set for a large pool of distinct colors
58
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
 
59
  def extract_label(node_name):
60
  """Extracts the label from a node string like 'Text (Label)'."""
61
  match = re.search(r'\(([^)]+)\)$', node_name)
62
  return match.group(1) if match else "Unknown"
 
63
  def remove_trailing_punctuation(text_string):
64
  """Removes trailing punctuation from a string."""
65
  return text_string.rstrip(string.punctuation)
 
66
  def get_dynamic_color_map(active_labels, fixed_map):
67
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
68
  color_map = {}
@@ -78,6 +86,7 @@ def get_dynamic_color_map(active_labels, fixed_map):
78
  # Generate a new color from the palette
79
  color_map[label] = next(COLOR_PALETTE)
80
  return color_map
 
81
  def highlight_entities(text, df_entities, entity_color_map):
82
  """
83
  Generates HTML to display text with entities highlighted and colored.
@@ -103,6 +112,7 @@ def highlight_entities(text, df_entities, entity_color_map):
103
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
104
  # Use a div to mimic the Streamlit input box style for the report
105
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 
106
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
107
  """Performs basic Topic Modeling using LDA."""
108
  documents = df_entities['text'].unique().tolist()
@@ -137,6 +147,7 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
137
  return pd.DataFrame(topic_data_list)
138
  except Exception as e:
139
  return None
 
140
  def create_topic_word_bubbles(df_topic_data):
141
  """Generates a Plotly Bubble Chart for top words across all topics."""
142
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
@@ -166,6 +177,7 @@ def create_topic_word_bubbles(df_topic_data):
166
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
167
  )
168
  return fig
 
169
  def generate_network_graph(df, raw_text, entity_color_map):
170
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
171
  entity_counts = df['text'].value_counts().reset_index()
@@ -202,6 +214,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
202
  if n1 in pos_map and n2 in pos_map:
203
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
204
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
205
  fig = go.Figure()
206
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
207
  fig.add_trace(edge_trace)
@@ -235,7 +248,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
235
  margin=dict(t=50, b=10, l=10, r=10), height=600
236
  )
237
  return fig
238
- # --- CSV GENERATION FUNCTION ---
239
  def generate_entity_csv(df):
240
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
241
  csv_buffer = BytesIO()
@@ -243,6 +256,7 @@ def generate_entity_csv(df):
243
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
244
  csv_buffer.seek(0)
245
  return csv_buffer
 
246
  # -----------------------------------
247
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
248
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
@@ -252,6 +266,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
252
  """
253
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
254
  unique_categories = df['category'].unique()
 
255
  # 1. Generate Visualizations (Plotly HTML)
256
  # 1a. Treemap
257
  fig_treemap = px.treemap(
@@ -263,17 +278,21 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
263
  color_discrete_sequence=px.colors.qualitative.Dark24
264
  )
265
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
266
- treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn') # 1b. Pie Chart
 
 
267
  grouped_counts = df['category'].value_counts().reset_index()
268
  grouped_counts.columns = ['Category', 'Count']
269
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
270
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
271
  fig_pie.update_layout(margin=dict(t=50, b=10))
272
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
273
  # 1c. Bar Chart (Category Count)
274
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
275
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
276
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
277
  # 1d. Bar Chart (Most Frequent Entities)
278
  word_counts = df['text'].value_counts().reset_index()
279
  word_counts.columns = ['Entity', 'Count']
@@ -283,10 +302,11 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
283
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
284
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
285
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
286
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
287
  network_fig = generate_network_graph(df, text_input, entity_color_map)
288
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
289
- # 1f. Topic Charts HTML
290
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
291
  if df_topic_data is not None and not df_topic_data.empty:
292
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -299,13 +319,16 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
299
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
300
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
301
  topic_charts_html += '</div>'
 
302
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
303
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
 
304
  # 3. Entity Tables (Pandas to HTML)
305
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
306
  classes='table table-striped',
307
  index=False
308
  )
 
309
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
310
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
311
  <meta charset="UTF-8">
@@ -330,7 +353,8 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
330
  <div class="container">
331
  <h1>{report_title}</h1>
332
  <div class="metadata">
333
- {branding_html} <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
 
334
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
335
  </div>
336
  <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -338,8 +362,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
338
  <div class="highlighted-text-container">
339
  {highlighted_text_html}
340
  </div>
341
- <h2>2. Full Extracted Entities Table
342
- </h2>
343
  {entity_table_html}
344
  <h2>3. Data Visualizations</h2>
345
  <h3>3.1 Entity Distribution Treemap</h3>
@@ -358,7 +381,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
358
  </html>
359
  """
360
  return html_content
361
- # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
362
  def chunk_text(text, max_chunk_size=1500):
363
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
364
  # Split by double newline (paragraph) or sentence-like separators
@@ -378,6 +401,7 @@ def chunk_text(text, max_chunk_size=1500):
378
  if current_chunk:
379
  chunks.append((current_chunk, current_offset))
380
  return chunks
 
381
  def process_chunked_text(text, labels, model):
382
  """Processes large text in chunks and aggregates/offsets the entities."""
383
  # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
@@ -394,9 +418,9 @@ def process_chunked_text(text, labels, model):
394
  entity['end'] += chunk_offset
395
  all_entities.append(entity)
396
  return all_entities
397
- # -----------------------------------
398
- # --- Page Configuration and Styling (No Sidebar) ---
399
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 
400
  # --- Conditional Mobile Warning ---
401
  st.markdown(
402
  """
@@ -454,10 +478,8 @@ st.markdown(
454
  unsafe_allow_html=True)
455
 
456
  # --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
457
- # st.sidebar.header("Topic Modeling Settings 💡") # Removed sidebar header
458
 
459
- st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
460
- # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
461
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
462
  with tab1:
463
  with st.expander("Embed"):
@@ -471,6 +493,7 @@ with tab1:
471
  ></iframe>
472
  '''
473
  st.code(code, language="html")
 
474
  with tab2:
475
  expander = st.expander("**Important Notes**")
476
  expander.markdown("""
@@ -480,6 +503,7 @@ with tab2:
480
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
481
  """)
482
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
 
483
  # --- Comet ML Setup (Placeholder/Conditional) ---
484
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
485
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -498,6 +522,7 @@ def load_ner_model(labels):
498
  print(f"FATAL ERROR: Failed to load NER model: {e}")
499
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
500
  st.stop()
 
501
  # --- LONG DEFAULT TEXT (178 Words) ---
502
  DEFAULT_TEXT = (
503
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -514,6 +539,7 @@ DEFAULT_TEXT = (
514
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
515
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
516
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
 
517
  # -----------------------------------
518
  # --- Session State Initialization (CRITICAL FIX) ---
519
  if 'show_results' not in st.session_state: st.session_state.show_results = False
@@ -530,8 +556,9 @@ if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slid
530
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
531
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
532
  if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
 
 
533
 
534
- # --- Clear Button Function (MODIFIED) ---
535
  def clear_text():
536
  """Clears the text area (sets it to an empty string) and hides results."""
537
  st.session_state['my_text_area'] = ""
@@ -586,81 +613,72 @@ if run_button:
586
  st.session_state.is_custom_mode = False
587
 
588
  active_labels = st.session_state.active_labels_list
589
-
590
- if not text.strip():
591
- st.warning("Please enter some text to extract entities.")
592
- st.session_state.show_results = False
593
- elif word_count > word_limit:
594
- st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
595
- st.session_state.show_results = False
596
- else:
597
- # Define a safe threshold for when to start chunking (e.g., above 500 words)
 
 
 
 
 
598
  CHUNKING_THRESHOLD = 500
599
  should_chunk = word_count > CHUNKING_THRESHOLD
600
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
601
  if should_chunk:
602
  mode_msg += " with **chunking** for large text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
 
604
- # --- Topic Modeling Input Retrieval (Using default or current state values) ---
605
- # The actual sliders are only visible after results are shown, so here we use the state defaults
606
- # or the last successfully run values to check for changes and run the model.
607
- # Use the key that holds the current value, which is initialized at the top level
608
- current_num_topics = st.session_state.num_topics_slider
609
- current_num_top_words = st.session_state.num_top_words_slider
610
-
611
- with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
612
- # Re-run prediction only if text, active labels, OR topic parameters have changed
613
- current_settings = (text, tuple(active_labels), current_num_topics, current_num_top_words)
614
- # Add topic settings to last_settings check
615
- last_settings = (
616
- st.session_state.last_text,
617
- tuple(st.session_state.get('last_active_labels', [])),
618
- st.session_state.get('last_num_topics', None),
619
- st.session_state.get('last_num_top_words', None)
620
- )
621
-
622
- if current_settings != last_settings:
623
- start_time = time.time()
624
- ner_model = load_ner_model(labels=active_labels)
625
-
626
- # 2. Perform NER Extraction
627
- if should_chunk:
628
- all_entities_list = process_chunked_text(text, active_labels, ner_model)
629
- else:
630
- all_entities_list = ner_model.predict_entities(text, active_labels)
631
-
632
- df = pd.DataFrame(all_entities_list)
633
-
634
- if df.empty:
635
- df_topic_data = None
636
  else:
637
- # 3. Add Category Mapping
638
- df['category'] = df['label'].apply(
639
- lambda l: REVERSE_FIXED_CATEGORY_MAPPING.get(l, "User Defined Entities")
640
- )
641
-
642
- # 4. Perform Topic Modeling (Passing the new parameters)
643
- df_topic_data = perform_topic_modeling(
644
- df_entities=df,
645
- num_topics=current_num_topics, # PARAMETER
646
- num_top_words=current_num_top_words # PARAMETER
647
- )
648
-
649
- end_time = time.time()
650
- elapsed_time = end_time - start_time
651
-
652
- # 5. Save Results to Session State
653
- st.session_state.results_df = df
654
- st.session_state.topic_results = df_topic_data
655
- st.session_state.elapsed_time = elapsed_time
656
- st.session_state.last_text = text
657
- st.session_state.show_results = True
658
- st.session_state.last_active_labels = active_labels
659
- st.session_state.last_num_topics = current_num_topics # Save topic settings
660
- st.session_state.last_num_top_words = current_num_top_words # Save topic settings
661
  else:
662
- st.info("Results already calculated for the current text and settings.")
663
- st.session_state.show_results = True
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
666
  if st.session_state.show_results:
@@ -677,9 +695,11 @@ if st.session_state.show_results:
677
  # 1. Highlighted Text
678
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
679
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
 
680
  # 2. Detailed Entity Analysis Tabs
681
  st.markdown("### 2. Detailed Entity Analysis")
682
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
683
  # Determine which categories to use for the tabs
684
  if st.session_state.is_custom_mode:
685
  unique_categories = ["User Defined Entities"]
@@ -687,95 +707,77 @@ if st.session_state.show_results:
687
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
688
  else:
689
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
690
- # --- Section 2a: Detailed Tables by Category/Label ---
691
-
692
-
693
-
694
-
695
- # --- Function to Apply Conditional Coloring to Scores ---
696
- def color_score_gradient(df):
697
- """
698
- Applies a color gradient to the 'score' column using Pandas Styler.
699
- High scores (closer to 1.0) will be darker/more saturated.
700
- """
701
- # Use 'YlGnBu' (Yellow-Green-Blue) gradient.
702
- # We apply the gradient only to the 'score' column subset.
703
- return df.style.background_gradient(
704
- cmap='YlGnBu',
705
- subset=['score']
706
- ).format(
707
- {'score': '{:.4f}'} # Re-apply the four decimal place format
708
- )
709
-
710
- # --- Your Main Tab Detail Logic ---
711
-
712
- # Note: This code assumes 'df', 'st.session_state.is_custom_mode', and 'unique_categories'
713
- # are already defined earlier in your Streamlit application.
714
-
715
- tab_category_details:
716
- st.markdown("#### Detailed Entities Table (Grouped by Category)")
717
-
718
- if st.session_state.is_custom_mode:
719
- # In custom mode, group by the actual label since the category is just "User Defined Entities"
720
- tabs_list = df['label'].unique().tolist()
721
- tabs_category = st.tabs(tabs_list)
722
-
723
- for label, tab in zip(tabs_list, tabs_category):
724
- # Prepare the DataFrame for the current label
725
- df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
726
-
727
- # Apply the coloring function
728
- styled_df_label = color_score_gradient(df_label)
729
-
730
- with tab:
731
- st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
732
- st.dataframe(
733
- # Pass the STYLED DataFrame object to Streamlit
734
- styled_df_label,
735
- use_container_width=True,
736
- # NOTE: st.column_config for 'score' is removed because Pandas Styler handles formatting and coloring
737
- )
738
- else:
739
- # In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
740
- tabs_category = st.tabs(unique_categories)
741
-
742
- for category, tab in zip(unique_categories, tabs_category):
743
- # Prepare the DataFrame for the current category
744
- df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
745
-
746
- # Apply the coloring function
747
- styled_df_category = color_score_gradient(df_category)
748
-
749
- with tab:
750
- st.markdown(f"##### {category} Entities ({len(df_category)} total)")
751
- if not df_category.empty:
752
- st.dataframe(
753
- # Pass the STYLED DataFrame object to Streamlit
754
- styled_df_category,
755
- use_container_width=True,
756
- # NOTE: st.column_config for 'score' is removed
757
- )
758
- else:
759
- st.info(f"No entities of category **{category}** were found in the text.")
760
-
761
-
762
-
763
-
764
-
765
-
766
-
767
-
768
-
769
-
770
-
771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
772
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
 
774
-
775
  # --- INSERTED GLOSSARY HERE ---
776
  with st.expander("See Glossary of tags"):
777
- st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
 
 
 
 
 
778
  # --- END GLOSSARY INSERTION ---
 
779
  # --- Section 2b: Treemap Visualization ---
780
  with tab_treemap_viz:
781
  st.markdown("#### Treemap: Entity Distribution")
@@ -788,6 +790,7 @@ tab_category_details:
788
  )
789
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
790
  st.plotly_chart(fig_treemap, use_container_width=True)
 
791
  # --- Section 3: Comparative Charts (COMPLETED) ---
792
  st.markdown("---")
793
  st.markdown("### 3. Comparative Charts")
@@ -796,15 +799,18 @@ tab_category_details:
796
  grouped_counts.columns = ['Category', 'Count']
797
  # Determine color sequence for charts
798
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
 
799
  with col1: # Pie Chart
800
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
801
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
802
  st.plotly_chart(fig_pie, use_container_width=True)
 
803
  with col2: # Bar Chart by Category
804
  st.markdown("#### Entity Count by Category")
805
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
806
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
807
  st.plotly_chart(fig_bar_category, use_container_width=True)
 
808
  with col3: # Bar Chart for Most Frequent Entities
809
  st.markdown("#### Top 10 Most Frequent Entities")
810
  word_counts = df['text'].value_counts().reset_index()
@@ -832,7 +838,6 @@ tab_category_details:
832
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
833
 
834
  col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
835
-
836
  with col_slider_topic:
837
  new_num_topics = st.slider(
838
  "Number of Topics",
@@ -859,7 +864,6 @@ tab_category_details:
859
  # Update session state with the new slider values
860
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
861
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
862
-
863
  # Recalculate topic modeling results
864
  if not st.session_state.results_df.empty:
865
  df_topic_data_new = perform_topic_modeling(
@@ -884,7 +888,6 @@ tab_category_details:
884
  * Topics: **{st.session_state.last_num_topics}**
885
  * Top Words: **{st.session_state.last_num_top_words}**
886
  """)
887
-
888
  df_topic_data = st.session_state.topic_results # Get the potentially updated results
889
  if df_topic_data is not None and not df_topic_data.empty:
890
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
@@ -892,7 +895,6 @@ tab_category_details:
892
  else:
893
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
894
 
895
-
896
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
897
  st.markdown("---")
898
  st.markdown("### 5. White-Label Report Configuration 🎨")
@@ -909,10 +911,12 @@ tab_category_details:
909
  key='custom_branding_input',
910
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
911
  )
 
912
  # 6. Downloads (Updated to pass custom variables)
913
  st.markdown("---")
914
  st.markdown("### 6. Downloads")
915
  col_csv, col_html = st.columns(2)
 
916
  # CSV Download
917
  csv_buffer = generate_entity_csv(df)
918
  with col_csv:
@@ -923,9 +927,11 @@ tab_category_details:
923
  mime="text/csv",
924
  use_container_width=True
925
  )
 
926
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
927
  # We wrap the user's plain text in a styled HTML paragraph element
928
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
 
929
  # HTML Download (Passing custom white-label parameters)
930
  html_content = generate_html_report(
931
  df,
@@ -944,4 +950,12 @@ tab_category_details:
944
  file_name="ner_topic_full_report.html",
945
  mime="text/html",
946
  use_container_width=True
947
- )
 
 
 
 
 
 
 
 
 
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
25
+
26
  # Using a try/except for comet_ml import
27
  try:
28
  from comet_ml import Experiment
 
32
  def log_parameter(self, *args): pass
33
  def log_table(self, *args): pass
34
  def end(self): pass
35
+
36
  # --- Model Home Directory (Fix for deployment environments) ---
37
  os.environ['HF_HOME'] = '/tmp'
38
+
39
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
40
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
41
  FIXED_ENTITY_COLOR_MAP = {
 
54
  "People & Roles": ["person", "organization", "position"],
55
  "Locations": ["country", "city"],
56
  "Time & Dates": ["date", "time"],
57
+ "Numbers & Finance": ["money", "cardinal"]
58
+ }
59
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
60
+
61
  # --- Dynamic Color Generator for Custom Labels ---
62
+ # Use Plotly's Alphabet set for a large pool of distinct colors
63
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
64
+
65
def extract_label(node_name):
    """Return the label embedded in a node string such as 'Text (Label)'.

    Falls back to the literal string "Unknown" when no trailing
    parenthesized label is present.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found:
        return found.group(1)
    return "Unknown"
69
+
70
def remove_trailing_punctuation(text_string):
    """Drop every trailing character that belongs to string.punctuation.

    Equivalent to str.rstrip(string.punctuation): characters are removed
    from the end until a non-punctuation character (or the start of the
    string) is reached.
    """
    cut = len(text_string)
    while cut > 0 and text_string[cut - 1] in string.punctuation:
        cut -= 1
    return text_string[:cut]
73
+
74
  def get_dynamic_color_map(active_labels, fixed_map):
75
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
76
  color_map = {}
 
86
  # Generate a new color from the palette
87
  color_map[label] = next(COLOR_PALETTE)
88
  return color_map
89
+
90
  def highlight_entities(text, df_entities, entity_color_map):
91
  """
92
  Generates HTML to display text with entities highlighted and colored.
 
112
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
113
  # Use a div to mimic the Streamlit input box style for the report
114
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
115
+
116
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
117
  """Performs basic Topic Modeling using LDA."""
118
  documents = df_entities['text'].unique().tolist()
 
147
  return pd.DataFrame(topic_data_list)
148
  except Exception as e:
149
  return None
150
+
151
  def create_topic_word_bubbles(df_topic_data):
152
  """Generates a Plotly Bubble Chart for top words across all topics."""
153
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
 
177
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
178
  )
179
  return fig
180
+
181
  def generate_network_graph(df, raw_text, entity_color_map):
182
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
183
  entity_counts = df['text'].value_counts().reset_index()
 
214
  if n1 in pos_map and n2 in pos_map:
215
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
216
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
217
+
218
  fig = go.Figure()
219
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
220
  fig.add_trace(edge_trace)
 
248
  margin=dict(t=50, b=10, l=10, r=10), height=600
249
  )
250
  return fig
251
+
252
  def generate_entity_csv(df):
253
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
254
  csv_buffer = BytesIO()
 
256
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
257
  csv_buffer.seek(0)
258
  return csv_buffer
259
+
260
  # -----------------------------------
261
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
262
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
 
266
  """
267
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
268
  unique_categories = df['category'].unique()
269
+
270
  # 1. Generate Visualizations (Plotly HTML)
271
  # 1a. Treemap
272
  fig_treemap = px.treemap(
 
278
  color_discrete_sequence=px.colors.qualitative.Dark24
279
  )
280
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
281
+ treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
282
+
283
+ # 1b. Pie Chart
284
  grouped_counts = df['category'].value_counts().reset_index()
285
  grouped_counts.columns = ['Category', 'Count']
286
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
287
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
288
  fig_pie.update_layout(margin=dict(t=50, b=10))
289
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
290
+
291
  # 1c. Bar Chart (Category Count)
292
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
293
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
294
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
295
+
296
  # 1d. Bar Chart (Most Frequent Entities)
297
  word_counts = df['text'].value_counts().reset_index()
298
  word_counts.columns = ['Entity', 'Count']
 
302
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
303
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
304
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
305
+
306
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
307
  network_fig = generate_network_graph(df, text_input, entity_color_map)
308
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
309
+
310
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
311
  if df_topic_data is not None and not df_topic_data.empty:
312
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
319
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
320
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
321
  topic_charts_html += '</div>'
322
+
323
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
324
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
325
+
326
  # 3. Entity Tables (Pandas to HTML)
327
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
328
  classes='table table-striped',
329
  index=False
330
  )
331
+
332
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
333
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
334
  <meta charset="UTF-8">
 
353
  <div class="container">
354
  <h1>{report_title}</h1>
355
  <div class="metadata">
356
+ {branding_html}
357
+ <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
358
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
359
  </div>
360
  <h2>1. Analyzed Text & Extracted Entities</h2>
 
362
  <div class="highlighted-text-container">
363
  {highlighted_text_html}
364
  </div>
365
+ <h2>2. Full Extracted Entities Table </h2>
 
366
  {entity_table_html}
367
  <h2>3. Data Visualizations</h2>
368
  <h3>3.1 Entity Distribution Treemap</h3>
 
381
  </html>
382
  """
383
  return html_content
384
+
385
  def chunk_text(text, max_chunk_size=1500):
386
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
387
  # Split by double newline (paragraph) or sentence-like separators
 
401
  if current_chunk:
402
  chunks.append((current_chunk, current_offset))
403
  return chunks
404
+
405
  def process_chunked_text(text, labels, model):
406
  """Processes large text in chunks and aggregates/offsets the entities."""
407
  # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
 
418
  entity['end'] += chunk_offset
419
  all_entities.append(entity)
420
  return all_entities
421
+
 
422
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
423
+
424
  # --- Conditional Mobile Warning ---
425
  st.markdown(
426
  """
 
478
  unsafe_allow_html=True)
479
 
480
  # --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
481
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue")
482
 
 
 
483
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
484
  with tab1:
485
  with st.expander("Embed"):
 
493
  ></iframe>
494
  '''
495
  st.code(code, language="html")
496
+
497
  with tab2:
498
  expander = st.expander("**Important Notes**")
499
  expander.markdown("""
 
503
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
504
  """)
505
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
506
+
507
  # --- Comet ML Setup (Placeholder/Conditional) ---
508
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
509
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
522
  print(f"FATAL ERROR: Failed to load NER model: {e}")
523
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
524
  st.stop()
525
+
526
  # --- LONG DEFAULT TEXT (178 Words) ---
527
  DEFAULT_TEXT = (
528
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
 
539
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
540
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
541
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
542
+
543
  # -----------------------------------
544
  # --- Session State Initialization (CRITICAL FIX) ---
545
  if 'show_results' not in st.session_state: st.session_state.show_results = False
 
556
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
557
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
558
  if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
559
+ if 'last_active_labels' not in st.session_state: st.session_state.last_active_labels = None # Added for results comparison
560
+
561
 
 
562
  def clear_text():
563
  """Clears the text area (sets it to an empty string) and hides results."""
564
  st.session_state['my_text_area'] = ""
 
613
  st.session_state.is_custom_mode = False
614
 
615
  active_labels = st.session_state.active_labels_list
616
+
617
+ # Get current topic modeling settings (used for caching logic)
618
+ current_num_topics = st.session_state.num_topics_slider
619
+ current_num_top_words = st.session_state.num_top_words_slider
620
+
621
+ # Caching Logic: Check if we need to re-run the full process
622
+ should_rerun_full_analysis = (
623
+ text.strip() != st.session_state.last_text.strip() or
624
+ active_labels != st.session_state.last_active_labels
625
+ )
626
+
627
+ if should_rerun_full_analysis and text.strip() and word_count <= word_limit:
628
+
629
+ # 2. Rerunning Full Analysis
630
  CHUNKING_THRESHOLD = 500
631
  should_chunk = word_count > CHUNKING_THRESHOLD
632
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
633
  if should_chunk:
634
  mode_msg += " with **chunking** for large text"
635
+
636
+ with st.spinner(f"Analyzing text with {mode_msg}..."):
637
+ start_time = time.time()
638
+
639
+ # 2a. Load Model (Model constraints are updated based on active labels)
640
+ # NOTE: Load time is cached, so this is fast on subsequent runs.
641
+ model = load_ner_model(active_labels)
642
+
643
+ # 2b. Extract Entities (using chunking if necessary)
644
+ if should_chunk:
645
+ all_entities = process_chunked_text(text, active_labels, model)
646
+ else:
647
+ all_entities = model.predict_entities(text, active_labels)
648
+
649
+ end_time = time.time()
650
+ elapsed_time = end_time - start_time
651
+
652
+ # 2c. Prepare DataFrame
653
+ df = pd.DataFrame(all_entities)
654
 
655
+ if not df.empty:
656
+ # Add category mapping
657
+ if st.session_state.is_custom_mode:
658
+ df['category'] = 'User Defined Entities'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  else:
660
+ df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
661
+
662
+ # Clean up extracted text
663
+ df['text'] = df['text'].apply(remove_trailing_punctuation)
664
+
665
+ # 2d. Perform Topic Modeling on extracted entities
666
+ df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
667
  else:
668
+ df_topic_data = None
669
+
670
+ # 5. Save Results to Session State
671
+ st.session_state.results_df = df
672
+ st.session_state.topic_results = df_topic_data
673
+ st.session_state.elapsed_time = elapsed_time
674
+ st.session_state.last_text = text
675
+ st.session_state.show_results = True
676
+ st.session_state.last_active_labels = active_labels
677
+ st.session_state.last_num_topics = current_num_topics # Save topic settings
678
+ st.session_state.last_num_top_words = current_num_top_words # Save topic settings
679
+ else:
680
+ st.info("Results already calculated for the current text and settings.")
681
+ st.session_state.show_results = True
682
 
683
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
684
  if st.session_state.show_results:
 
695
  # 1. Highlighted Text
696
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
697
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
698
+
699
  # 2. Detailed Entity Analysis Tabs
700
  st.markdown("### 2. Detailed Entity Analysis")
701
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
702
+
703
  # Determine which categories to use for the tabs
704
  if st.session_state.is_custom_mode:
705
  unique_categories = ["User Defined Entities"]
 
707
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
708
  else:
709
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
 
711
+ # --- Section 2a: Detailed Tables by Category/Label ---
712
+ # --- Function to Apply Conditional Coloring to Scores ---
713
+ def color_score_gradient(df):
714
+ """
715
+ Applies a color gradient to the 'score' column using Pandas Styler.
716
+ High scores (closer to 1.0) will be darker/more saturated.
717
+ """
718
+ # Use 'YlGnBu' (Yellow-Green-Blue) gradient.
719
+ # We apply the gradient only to the 'score' column subset.
720
+ return df.style.background_gradient(
721
+ cmap='YlGnBu',
722
+ subset=['score']
723
+ ).format(
724
+ {'score': '{:.4f}'} # Re-apply the four decimal place format
725
+ )
726
 
727
+ # --- Your Main Tab Detail Logic ---
728
+ with tab_category_details:
729
+ st.markdown("#### Detailed Entities Table (Grouped by Category)")
730
+ if st.session_state.is_custom_mode:
731
+ # In custom mode, group by the actual label since the category is just "User Defined Entities"
732
+ tabs_list = df['label'].unique().tolist()
733
+ tabs_category = st.tabs(tabs_list)
734
+
735
+ for label, tab in zip(tabs_list, tabs_category):
736
+ # Prepare the DataFrame for the current label
737
+ df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
738
+
739
+ # Apply the coloring function
740
+ styled_df_label = color_score_gradient(df_label)
741
+ with tab:
742
+ st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
743
+ st.dataframe(
744
+ # Pass the STYLED DataFrame object to Streamlit
745
+ styled_df_label,
746
+ use_container_width=True,
747
+ # NOTE: st.column_config for 'score' is removed because Pandas Styler handles formatting and coloring
748
+ )
749
+ else:
750
+ # In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
751
+ tabs_category = st.tabs(unique_categories)
752
+
753
+ for category, tab in zip(unique_categories, tabs_category):
754
+ # Prepare the DataFrame for the current category
755
+ df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
756
+
757
+ # Apply the coloring function
758
+ styled_df_category = color_score_gradient(df_category)
759
+ with tab:
760
+ st.markdown(f"##### {category} Entities ({len(df_category)} total)")
761
+ if not df_category.empty:
762
+ st.dataframe(
763
+ # Pass the STYLED DataFrame object to Streamlit
764
+ styled_df_category,
765
+ use_container_width=True,
766
+ # NOTE: st.column_config for 'score' is removed
767
+ )
768
+ else:
769
+ st.info(f"No entities of category **{category}** were found in the text.")
770
 
 
771
  # --- INSERTED GLOSSARY HERE ---
772
  with st.expander("See Glossary of tags"):
773
+ st.write('''- **text**: ['entity extracted from your text data']
774
+ - **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']
775
+ - **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']
776
+ - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
777
+ - **start**: ['index of the start of the corresponding entity']
778
+ - **end**: ['index of the end of the corresponding entity']''')
779
  # --- END GLOSSARY INSERTION ---
780
+
781
  # --- Section 2b: Treemap Visualization ---
782
  with tab_treemap_viz:
783
  st.markdown("#### Treemap: Entity Distribution")
 
790
  )
791
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
792
  st.plotly_chart(fig_treemap, use_container_width=True)
793
+
794
  # --- Section 3: Comparative Charts (COMPLETED) ---
795
  st.markdown("---")
796
  st.markdown("### 3. Comparative Charts")
 
799
  grouped_counts.columns = ['Category', 'Count']
800
  # Determine color sequence for charts
801
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
802
+
803
  with col1: # Pie Chart
804
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
805
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
806
  st.plotly_chart(fig_pie, use_container_width=True)
807
+
808
  with col2: # Bar Chart by Category
809
  st.markdown("#### Entity Count by Category")
810
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
811
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
812
  st.plotly_chart(fig_bar_category, use_container_width=True)
813
+
814
  with col3: # Bar Chart for Most Frequent Entities
815
  st.markdown("#### Top 10 Most Frequent Entities")
816
  word_counts = df['text'].value_counts().reset_index()
 
838
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
839
 
840
  col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
 
841
  with col_slider_topic:
842
  new_num_topics = st.slider(
843
  "Number of Topics",
 
864
  # Update session state with the new slider values
865
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
866
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
 
867
  # Recalculate topic modeling results
868
  if not st.session_state.results_df.empty:
869
  df_topic_data_new = perform_topic_modeling(
 
888
  * Topics: **{st.session_state.last_num_topics}**
889
  * Top Words: **{st.session_state.last_num_top_words}**
890
  """)
 
891
  df_topic_data = st.session_state.topic_results # Get the potentially updated results
892
  if df_topic_data is not None and not df_topic_data.empty:
893
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
 
895
  else:
896
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
897
 
 
898
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
899
  st.markdown("---")
900
  st.markdown("### 5. White-Label Report Configuration 🎨")
 
911
  key='custom_branding_input',
912
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
913
  )
914
+
915
  # 6. Downloads (Updated to pass custom variables)
916
  st.markdown("---")
917
  st.markdown("### 6. Downloads")
918
  col_csv, col_html = st.columns(2)
919
+
920
  # CSV Download
921
  csv_buffer = generate_entity_csv(df)
922
  with col_csv:
 
927
  mime="text/csv",
928
  use_container_width=True
929
  )
930
+
931
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
932
  # We wrap the user's plain text in a styled HTML paragraph element
933
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
934
+
935
  # HTML Download (Passing custom white-label parameters)
936
  html_content = generate_html_report(
937
  df,
 
950
  file_name="ner_topic_full_report.html",
951
  mime="text/html",
952
  use_container_width=True
953
+ )
954
+
955
+
956
+
957
+
958
+
959
+
960
+
961
+