AIEcosystem committed on
Commit
3dbd695
·
verified ·
1 Parent(s): b3682ae

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +82 -143
src/streamlit_app.py CHANGED
@@ -22,7 +22,6 @@ from sklearn.decomposition import LatentDirichletAllocation
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
25
-
26
  # Using a try/except for comet_ml import
27
  try:
28
  from comet_ml import Experiment
@@ -32,10 +31,8 @@ except ImportError:
32
  def log_parameter(self, *args): pass
33
  def log_table(self, *args): pass
34
  def end(self): pass
35
-
36
  # --- Model Home Directory (Fix for deployment environments) ---
37
  os.environ['HF_HOME'] = '/tmp'
38
-
39
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
40
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
41
  FIXED_ENTITY_COLOR_MAP = {
@@ -49,7 +46,6 @@ FIXED_ENTITY_COLOR_MAP = {
49
  "money": "#f43f5e", # Red
50
  "position": "#a855f7", # Violet
51
  }
52
-
53
  # --- Fixed Category Mapping ---
54
  FIXED_CATEGORY_MAPPING = {
55
  "People & Roles": ["person", "organization", "position"],
@@ -57,20 +53,16 @@ FIXED_CATEGORY_MAPPING = {
57
  "Time & Dates": ["date", "time"],
58
  "Numbers & Finance": ["money", "cardinal"]}
59
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
60
-
61
  # --- Dynamic Color Generator for Custom Labels ---
62
  # Use Plotly's Alphabet set for a large pool of distinct colors
63
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
64
-
65
def extract_label(node_name):
    """Return the parenthesized label from a node string such as 'Text (Label)'.

    Falls back to "Unknown" when the string has no trailing '(...)' group.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found:
        return found.group(1)
    return "Unknown"
69
-
70
def remove_trailing_punctuation(text_string):
    """Strip any run of trailing punctuation characters from *text_string*."""
    trimmed = text_string
    # Peel punctuation off the end one character at a time; equivalent to
    # rstrip(string.punctuation) but written as an explicit loop.
    while trimmed and trimmed[-1] in string.punctuation:
        trimmed = trimmed[:-1]
    return trimmed
73
-
74
  def get_dynamic_color_map(active_labels, fixed_map):
75
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
76
  color_map = {}
@@ -86,7 +78,6 @@ def get_dynamic_color_map(active_labels, fixed_map):
86
  # Generate a new color from the palette
87
  color_map[label] = next(COLOR_PALETTE)
88
  return color_map
89
-
90
  def highlight_entities(text, df_entities, entity_color_map):
91
  """
92
  Generates HTML to display text with entities highlighted and colored.
@@ -101,11 +92,9 @@ def highlight_entities(text, df_entities, entity_color_map):
101
  # Ensure the entity indices are within the bounds of the full text
102
  start = max(0, entity['start'])
103
  end = min(len(text), entity['end'])
104
-
105
  # Get entity text from the full document based on its indices
106
  # The 'text' column in the dataframe is now an attribute of the chunked text, not the original span
107
  entity_text_from_full_doc = text[start:end]
108
-
109
  label = entity['label']
110
  color = entity_color_map.get(label, '#000000')
111
  # Create a span with background color and tooltip
@@ -114,7 +103,6 @@ def highlight_entities(text, df_entities, entity_color_map):
114
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
115
  # Use a div to mimic the Streamlit input box style for the report
116
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
117
-
118
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
119
  """Performs basic Topic Modeling using LDA."""
120
  documents = df_entities['text'].unique().tolist()
@@ -122,29 +110,24 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
122
  # but here we use the extracted entity texts as per the original code's intent.
123
  if len(documents) < 2:
124
  return None
125
-
126
  N = min(num_top_words, len(documents))
127
  try:
128
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
129
  tfidf = tfidf_vectorizer.fit_transform(documents)
130
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
131
-
132
  if len(tfidf_feature_names) < num_topics:
133
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
134
  tfidf = tfidf_vectorizer.fit_transform(documents)
135
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
136
  if len(tfidf_feature_names) < num_topics:
137
  return None
138
-
139
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
140
  lda.fit(tfidf)
141
-
142
  topic_data_list = []
143
  for topic_idx, topic in enumerate(lda.components_):
144
  top_words_indices = topic.argsort()[:-N - 1:-1]
145
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
146
  word_weights = [topic[i] for i in top_words_indices]
147
-
148
  for word, weight in zip(top_words, word_weights):
149
  topic_data_list.append({
150
  'Topic_ID': f'Topic #{topic_idx + 1}',
@@ -152,17 +135,14 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
152
  'Weight': weight,
153
  })
154
  return pd.DataFrame(topic_data_list)
155
-
156
  except Exception as e:
157
  return None
158
-
159
  def create_topic_word_bubbles(df_topic_data):
160
  """Generates a Plotly Bubble Chart for top words across all topics."""
161
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
162
  df_topic_data['x_pos'] = df_topic_data.index
163
  if df_topic_data.empty:
164
  return None
165
-
166
  fig = px.scatter(
167
  df_topic_data,
168
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
@@ -186,7 +166,6 @@ def create_topic_word_bubbles(df_topic_data):
186
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
187
  )
188
  return fig
189
-
190
  def generate_network_graph(df, raw_text, entity_color_map):
191
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
192
  entity_counts = df['text'].value_counts().reset_index()
@@ -194,7 +173,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
194
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
195
  if unique_entities.shape[0] < 2:
196
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
197
-
198
  num_nodes = len(unique_entities)
199
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
200
  radius = 10
@@ -217,7 +195,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
217
  node2 = unique_entities_in_sentence[j]
218
  edge_tuple = tuple(sorted((node1, node2)))
219
  edges.add(edge_tuple)
220
-
221
  edge_x = []
222
  edge_y = []
223
  for edge in edges:
@@ -225,11 +202,9 @@ def generate_network_graph(df, raw_text, entity_color_map):
225
  if n1 in pos_map and n2 in pos_map:
226
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
227
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
228
-
229
  fig = go.Figure()
230
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
231
  fig.add_trace(edge_trace)
232
-
233
  fig.add_trace(go.Scatter(
234
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
235
  marker=dict(
@@ -241,7 +216,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
241
  customdata=unique_entities[['label', 'score', 'frequency']],
242
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
243
  ))
244
-
245
  legend_traces = []
246
  seen_labels = set()
247
  for index, row in unique_entities.iterrows():
@@ -250,10 +224,8 @@ def generate_network_graph(df, raw_text, entity_color_map):
250
  seen_labels.add(label)
251
  color = entity_color_map.get(label, '#cccccc')
252
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
253
-
254
  for trace in legend_traces:
255
  fig.add_trace(trace)
256
-
257
  fig.update_layout(
258
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
259
  showlegend=True, hovermode='closest',
@@ -263,7 +235,6 @@ def generate_network_graph(df, raw_text, entity_color_map):
263
  margin=dict(t=50, b=10, l=10, r=10), height=600
264
  )
265
  return fig
266
-
267
  # --- CSV GENERATION FUNCTION ---
268
  def generate_entity_csv(df):
269
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
@@ -273,7 +244,6 @@ def generate_entity_csv(df):
273
  csv_buffer.seek(0)
274
  return csv_buffer
275
  # -----------------------------------
276
-
277
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
278
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
279
  """
@@ -282,7 +252,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
282
  """
283
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
284
  unique_categories = df['category'].unique()
285
-
286
  # 1. Generate Visualizations (Plotly HTML)
287
  # 1a. Treemap
288
  fig_treemap = px.treemap(
@@ -294,21 +263,17 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
294
  color_discrete_sequence=px.colors.qualitative.Dark24
295
  )
296
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
297
- treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
298
-
299
- # 1b. Pie Chart
300
  grouped_counts = df['category'].value_counts().reset_index()
301
  grouped_counts.columns = ['Category', 'Count']
302
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
303
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
304
  fig_pie.update_layout(margin=dict(t=50, b=10))
305
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
306
-
307
  # 1c. Bar Chart (Category Count)
308
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
309
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
310
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
311
-
312
  # 1d. Bar Chart (Most Frequent Entities)
313
  word_counts = df['text'].value_counts().reset_index()
314
  word_counts.columns = ['Entity', 'Count']
@@ -318,12 +283,10 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
318
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
319
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
320
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
321
-
322
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
323
  network_fig = generate_network_graph(df, text_input, entity_color_map)
324
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
325
-
326
- # 1f. Topic Charts HTML
327
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
328
  if df_topic_data is not None and not df_topic_data.empty:
329
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -336,16 +299,13 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
336
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
337
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
338
  topic_charts_html += '</div>'
339
-
340
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
341
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
342
-
343
  # 3. Entity Tables (Pandas to HTML)
344
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
345
  classes='table table-striped',
346
  index=False
347
  )
348
-
349
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
350
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
351
  <meta charset="UTF-8">
@@ -370,8 +330,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
370
  <div class="container">
371
  <h1>{report_title}</h1>
372
  <div class="metadata">
373
- {branding_html} <!-- CUSTOM BRANDING INSERTED HERE -->
374
- <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
375
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
376
  </div>
377
  <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -399,8 +358,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
399
  </html>
400
  """
401
  return html_content
402
- # -----------------------------------
403
-
404
  # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
405
  def chunk_text(text, max_chunk_size=1500):
406
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
@@ -409,10 +366,8 @@ def chunk_text(text, max_chunk_size=1500):
409
  chunks = []
410
  current_chunk = ""
411
  current_offset = 0
412
-
413
  for segment in segments:
414
  if not segment: continue
415
-
416
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
417
  # Save the current chunk and its starting offset
418
  chunks.append((current_chunk, current_offset))
@@ -422,34 +377,26 @@ def chunk_text(text, max_chunk_size=1500):
422
  current_chunk += segment
423
  if current_chunk:
424
  chunks.append((current_chunk, current_offset))
425
-
426
  return chunks
427
-
428
def process_chunked_text(text, labels, model):
    """Process large text in chunks and aggregate entities with corrected offsets.

    Splits *text* into character-bounded chunks via chunk_text(), runs the
    model on each chunk, then shifts every entity's 'start'/'end' indices by
    the chunk's starting offset so they index into the original document.

    Args:
        text: The full input document string.
        labels: Entity labels passed through to the model.
        model: A loaded model exposing predict_entities(chunk, labels) that
            returns dicts with 'start' and 'end' keys.

    Returns:
        A flat list of entity dicts with document-relative indices.
    """
    # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
    # The word count limit is 10000, but we chunk around 500 words for safety/performance.
    MAX_CHUNK_CHARS = 3500
    chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
    all_entities = []
    # FIX: the loop variable was previously named `chunk_text`, shadowing the
    # chunk_text() helper function within this scope; renamed to avoid the clash.
    for segment_text, segment_offset in chunks:
        # Predict entities on the small chunk
        chunk_entities = model.predict_entities(segment_text, labels)
        # Offset the start and end indices to match the original document
        for entity in chunk_entities:
            entity['start'] += segment_offset
            entity['end'] += segment_offset
            all_entities.append(entity)
    return all_entities
  # -----------------------------------
449
-
450
  # --- Page Configuration and Styling (No Sidebar) ---
451
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
452
-
453
  # --- Conditional Mobile Warning ---
454
  st.markdown(
455
  """
@@ -463,7 +410,6 @@ st.markdown(
463
  [data-testid="stAppViewBlock"] {
464
  background-color: #ffffff !important;
465
  }
466
-
467
  /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
468
  @media (max-width: 600px) {
469
  #mobile-warning-container {
@@ -506,10 +452,32 @@ st.markdown(
506
  </div>
507
  """,
508
  unsafe_allow_html=True)
509
- # ----------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
511
  # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
512
-
513
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
514
  with tab1:
515
  with st.expander("Embed"):
@@ -523,20 +491,15 @@ with tab1:
523
  ></iframe>
524
  '''
525
  st.code(code, language="html")
526
-
527
  with tab2:
528
  expander = st.expander("**Important Notes**")
529
  expander.markdown("""
530
  **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
531
-
532
  **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
533
-
534
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
535
-
536
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
537
  """)
538
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
539
-
540
  # --- Comet ML Setup (Placeholder/Conditional) ---
541
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
542
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -544,7 +507,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
544
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
545
 
546
  # --- Model Loading ---
547
- @st.cache_resource
548
  def load_ner_model(labels):
549
  """Loads the GLiNER model and caches it."""
550
  try:
@@ -552,10 +515,9 @@ def load_ner_model(labels):
552
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
553
  except Exception as e:
554
  # Log the actual error to the console for debugging
555
- print(f"FATAL ERROR: Failed to load NER model: {e}")
556
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
557
  st.stop()
558
-
559
  # --- LONG DEFAULT TEXT (178 Words) ---
560
  DEFAULT_TEXT = (
561
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -573,7 +535,6 @@ DEFAULT_TEXT = (
573
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
574
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
575
  # -----------------------------------
576
-
577
  # --- Session State Initialization (CRITICAL FIX) ---
578
  if 'show_results' not in st.session_state: st.session_state.show_results = False
579
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
@@ -620,7 +581,7 @@ with col_results:
620
  with col_clear:
621
  st.button("Clear text", on_click=clear_text, use_container_width=True)
622
 
623
- # --- Results Trigger and Processing (Updated Logic with Chunking) ---
624
  if run_button:
625
  # 1. Determine Active Labels and Mode
626
  custom_labels_raw = st.session_state.custom_labels_input
@@ -635,7 +596,6 @@ if run_button:
635
  else:
636
  st.session_state.active_labels_list = custom_labels_list
637
  st.session_state.is_custom_mode = True
638
-
639
  else:
640
  st.session_state.active_labels_list = FIXED_LABELS
641
  st.session_state.is_custom_mode = False
@@ -652,77 +612,73 @@ if run_button:
652
  # Define a safe threshold for when to start chunking (e.g., above 500 words)
653
  CHUNKING_THRESHOLD = 500
654
  should_chunk = word_count > CHUNKING_THRESHOLD
655
-
656
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
657
  if should_chunk:
658
  mode_msg += " with **chunking** for large text"
659
 
660
- with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
 
 
 
661
 
662
- # Re-run prediction only if text or active labels have changed
663
- current_settings = (text, tuple(active_labels))
664
- last_settings = (st.session_state.last_text, tuple(st.session_state.get('last_active_labels', [])))
 
 
 
 
 
 
 
665
 
666
  if current_settings != last_settings:
667
- st.session_state.last_text = text
668
- st.session_state['last_active_labels'] = active_labels
669
-
670
  start_time = time.time()
 
671
 
672
- # Load model using the determined active labels
673
- model = load_ner_model(active_labels)
674
-
675
- # --- Model Prediction & Dataframe Creation (Using Chunking if needed) ---
676
  if should_chunk:
677
- entities = process_chunked_text(text, active_labels, model)
678
- st.info(f"Text was split into {len(chunk_text(text))} chunks for processing.")
679
  else:
680
- # Original logic for small texts
681
- entities = model.predict_entities(text, active_labels)
682
 
683
- elapsed_time = time.time() - start_time
684
- st.session_state.elapsed_time = elapsed_time
685
 
686
- # --- DataFrame Construction ---
687
- df = pd.DataFrame(entities)
688
  if df.empty:
689
- st.session_state.results_df = df
690
- st.session_state.topic_results = None
691
- st.session_state.show_results = True
692
  else:
693
- # Clean up entity text (optional, but good practice)
694
- df['text'] = df['text'].apply(remove_trailing_punctuation)
695
-
696
- # Map entities to categories
697
- if st.session_state.is_custom_mode:
698
- # For custom labels, group everything under a single category
699
- df['category'] = "User Defined Entities"
700
- else:
701
- # For fixed labels, use the fixed mapping
702
- df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
703
-
704
- # Remove duplicates for topics/frequency analysis, keeping the highest score
705
- df_unique_entities = df.sort_values('score', ascending=False).drop_duplicates(subset=['text', 'label'])
706
-
707
- # --- Topic Modeling ---
708
- # We use the unique entities as input for the topic modeling
709
- df_topic_data = perform_topic_modeling(df_unique_entities, num_topics=min(3, len(df_unique_entities.text.unique())), num_top_words=10)
710
-
711
- # Update session state
712
- st.session_state.results_df = df
713
- st.session_state.topic_results = df_topic_data
714
- st.session_state.show_results = True
715
-
 
716
  else:
717
- # If settings haven't changed, just show the last results
718
  st.session_state.show_results = True
719
 
720
-
721
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
722
  if st.session_state.show_results:
723
  df = st.session_state.results_df
724
  df_topic_data = st.session_state.topic_results
725
-
726
  # Generate the color map based on the results DF labels
727
  current_labels_in_df = df['label'].unique().tolist()
728
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
@@ -731,15 +687,12 @@ if st.session_state.show_results:
731
  st.warning("No entities were found in the provided text with the current label set.")
732
  else:
733
  st.subheader("Analysis Results", divider="blue")
734
-
735
  # 1. Highlighted Text
736
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
737
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
738
-
739
  # 2. Detailed Entity Analysis Tabs
740
  st.markdown("### 2. Detailed Entity Analysis")
741
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
742
-
743
  # Determine which categories to use for the tabs
744
  if st.session_state.is_custom_mode:
745
  unique_categories = ["User Defined Entities"]
@@ -747,11 +700,9 @@ if st.session_state.show_results:
747
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
748
  else:
749
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
750
-
751
  # --- Section 2a: Detailed Tables by Category/Label ---
752
  with tab_category_details:
753
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
754
-
755
  if st.session_state.is_custom_mode:
756
  # In custom mode, group by the actual label since the category is just "User Defined Entities"
757
  tabs_list = df['label'].unique().tolist()
@@ -780,12 +731,10 @@ if st.session_state.show_results:
780
  )
781
  else:
782
  st.info(f"No entities of category **{category}** were found in the text.")
783
-
784
  # --- INSERTED GLOSSARY HERE ---
785
  with st.expander("See Glossary of tags"):
786
  st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
787
  # --- END GLOSSARY INSERTION ---
788
-
789
  # --- Section 2b: Treemap Visualization ---
790
  with tab_treemap_viz:
791
  st.markdown("#### Treemap: Entity Distribution")
@@ -798,28 +747,23 @@ if st.session_state.show_results:
798
  )
799
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
800
  st.plotly_chart(fig_treemap, use_container_width=True)
801
-
802
  # --- Section 3: Comparative Charts (COMPLETED) ---
803
  st.markdown("---")
804
  st.markdown("### 3. Comparative Charts")
805
  col1, col2, col3 = st.columns(3)
806
  grouped_counts = df['category'].value_counts().reset_index()
807
  grouped_counts.columns = ['Category', 'Count']
808
-
809
  # Determine color sequence for charts
810
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
811
-
812
  with col1: # Pie Chart
813
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
814
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
815
  st.plotly_chart(fig_pie, use_container_width=True)
816
-
817
  with col2: # Bar Chart by Category
818
  st.markdown("#### Entity Count by Category")
819
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
820
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
821
  st.plotly_chart(fig_bar_category, use_container_width=True)
822
-
823
  with col3: # Bar Chart for Most Frequent Entities
824
  st.markdown("#### Top 10 Most Frequent Entities")
825
  word_counts = df['text'].value_counts().reset_index()
@@ -831,35 +775,35 @@ if st.session_state.show_results:
831
  st.plotly_chart(fig_bar_freq, use_container_width=True)
832
  else:
833
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
834
-
835
  # 4. Network Graph and Topic Modeling
836
  st.markdown("---")
837
  st.markdown("### 4. Advanced Analysis")
838
  col_network, col_topic = st.columns(2)
839
-
840
  with col_network:
841
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
842
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
843
-
844
  with col_topic:
845
  with st.expander("💡 Topic Modeling (LDA)", expanded=True):
 
 
 
 
 
 
846
  if df_topic_data is not None and not df_topic_data.empty:
847
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
848
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
849
  else:
850
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
851
-
852
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
853
  st.markdown("---")
854
  st.markdown("### 5. White-Label Report Configuration 🎨")
855
-
856
  # Set a dynamic default title based on the mode
857
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
858
  custom_report_title = st.text_input(
859
  "Type Your Report Title (for HTML Report), and then press Enter.",
860
  value=default_report_title
861
  )
862
-
863
  # UPDATED: Simplified input for the user
864
  custom_branding_text_input = st.text_area(
865
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
@@ -867,13 +811,10 @@ if st.session_state.show_results:
867
  key='custom_branding_input',
868
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
869
  )
870
-
871
  # 6. Downloads (Updated to pass custom variables)
872
  st.markdown("---")
873
  st.markdown("### 6. Downloads")
874
-
875
  col_csv, col_html = st.columns(2)
876
-
877
  # CSV Download
878
  csv_buffer = generate_entity_csv(df)
879
  with col_csv:
@@ -884,11 +825,9 @@ if st.session_state.show_results:
884
  mime="text/csv",
885
  use_container_width=True
886
  )
887
-
888
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
889
  # We wrap the user's plain text in a styled HTML paragraph element
890
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
891
-
892
  # HTML Download (Passing custom white-label parameters)
893
  html_content = generate_html_report(
894
  df,
@@ -907,4 +846,4 @@ if st.session_state.show_results:
907
  file_name="ner_topic_full_report.html",
908
  mime="text/html",
909
  use_container_width=True
910
- )
 
22
  # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
 
25
  # Using a try/except for comet_ml import
26
  try:
27
  from comet_ml import Experiment
 
31
  def log_parameter(self, *args): pass
32
  def log_table(self, *args): pass
33
  def end(self): pass
 
34
  # --- Model Home Directory (Fix for deployment environments) ---
35
  os.environ['HF_HOME'] = '/tmp'
 
36
  # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
37
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
38
  FIXED_ENTITY_COLOR_MAP = {
 
46
  "money": "#f43f5e", # Red
47
  "position": "#a855f7", # Violet
48
  }
 
49
  # --- Fixed Category Mapping ---
50
  FIXED_CATEGORY_MAPPING = {
51
  "People & Roles": ["person", "organization", "position"],
 
53
  "Time & Dates": ["date", "time"],
54
  "Numbers & Finance": ["money", "cardinal"]}
55
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
 
56
  # --- Dynamic Color Generator for Custom Labels ---
57
  # Use Plotly's Alphabet set for a large pool of distinct colors
58
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
 
59
  def extract_label(node_name):
60
  """Extracts the label from a node string like 'Text (Label)'."""
61
  match = re.search(r'\(([^)]+)\)$', node_name)
62
  return match.group(1) if match else "Unknown"
 
63
  def remove_trailing_punctuation(text_string):
64
  """Removes trailing punctuation from a string."""
65
  return text_string.rstrip(string.punctuation)
 
66
  def get_dynamic_color_map(active_labels, fixed_map):
67
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
68
  color_map = {}
 
78
  # Generate a new color from the palette
79
  color_map[label] = next(COLOR_PALETTE)
80
  return color_map
 
81
  def highlight_entities(text, df_entities, entity_color_map):
82
  """
83
  Generates HTML to display text with entities highlighted and colored.
 
92
  # Ensure the entity indices are within the bounds of the full text
93
  start = max(0, entity['start'])
94
  end = min(len(text), entity['end'])
 
95
  # Get entity text from the full document based on its indices
96
  # The 'text' column in the dataframe is now an attribute of the chunked text, not the original span
97
  entity_text_from_full_doc = text[start:end]
 
98
  label = entity['label']
99
  color = entity_color_map.get(label, '#000000')
100
  # Create a span with background color and tooltip
 
103
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
104
  # Use a div to mimic the Streamlit input box style for the report
105
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 
106
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
107
  """Performs basic Topic Modeling using LDA."""
108
  documents = df_entities['text'].unique().tolist()
 
110
  # but here we use the extracted entity texts as per the original code's intent.
111
  if len(documents) < 2:
112
  return None
 
113
  N = min(num_top_words, len(documents))
114
  try:
115
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
116
  tfidf = tfidf_vectorizer.fit_transform(documents)
117
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
118
  if len(tfidf_feature_names) < num_topics:
119
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
120
  tfidf = tfidf_vectorizer.fit_transform(documents)
121
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
122
  if len(tfidf_feature_names) < num_topics:
123
  return None
 
124
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
125
  lda.fit(tfidf)
 
126
  topic_data_list = []
127
  for topic_idx, topic in enumerate(lda.components_):
128
  top_words_indices = topic.argsort()[:-N - 1:-1]
129
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
130
  word_weights = [topic[i] for i in top_words_indices]
 
131
  for word, weight in zip(top_words, word_weights):
132
  topic_data_list.append({
133
  'Topic_ID': f'Topic #{topic_idx + 1}',
 
135
  'Weight': weight,
136
  })
137
  return pd.DataFrame(topic_data_list)
 
138
  except Exception as e:
139
  return None
 
140
  def create_topic_word_bubbles(df_topic_data):
141
  """Generates a Plotly Bubble Chart for top words across all topics."""
142
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
143
  df_topic_data['x_pos'] = df_topic_data.index
144
  if df_topic_data.empty:
145
  return None
 
146
  fig = px.scatter(
147
  df_topic_data,
148
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
 
166
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
167
  )
168
  return fig
 
169
  def generate_network_graph(df, raw_text, entity_color_map):
170
  """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
171
  entity_counts = df['text'].value_counts().reset_index()
 
173
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
174
  if unique_entities.shape[0] < 2:
175
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
176
  num_nodes = len(unique_entities)
177
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
178
  radius = 10
 
195
  node2 = unique_entities_in_sentence[j]
196
  edge_tuple = tuple(sorted((node1, node2)))
197
  edges.add(edge_tuple)
 
198
  edge_x = []
199
  edge_y = []
200
  for edge in edges:
 
202
  if n1 in pos_map and n2 in pos_map:
203
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
204
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
205
  fig = go.Figure()
206
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
207
  fig.add_trace(edge_trace)
 
208
  fig.add_trace(go.Scatter(
209
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
210
  marker=dict(
 
216
  customdata=unique_entities[['label', 'score', 'frequency']],
217
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
218
  ))
 
219
  legend_traces = []
220
  seen_labels = set()
221
  for index, row in unique_entities.iterrows():
 
224
  seen_labels.add(label)
225
  color = entity_color_map.get(label, '#cccccc')
226
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
 
227
  for trace in legend_traces:
228
  fig.add_trace(trace)
 
229
  fig.update_layout(
230
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
231
  showlegend=True, hovermode='closest',
 
235
  margin=dict(t=50, b=10, l=10, r=10), height=600
236
  )
237
  return fig
 
238
  # --- CSV GENERATION FUNCTION ---
239
  def generate_entity_csv(df):
240
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
 
244
  csv_buffer.seek(0)
245
  return csv_buffer
246
  # -----------------------------------
 
247
  # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
248
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
249
  """
 
252
  """
253
  # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
254
  unique_categories = df['category'].unique()
 
255
  # 1. Generate Visualizations (Plotly HTML)
256
  # 1a. Treemap
257
  fig_treemap = px.treemap(
 
263
  color_discrete_sequence=px.colors.qualitative.Dark24
264
  )
265
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
266
+ treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
+ # 1b. Pie Chart
 
 
267
  grouped_counts = df['category'].value_counts().reset_index()
268
  grouped_counts.columns = ['Category', 'Count']
269
  color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
270
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
271
  fig_pie.update_layout(margin=dict(t=50, b=10))
272
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
273
  # 1c. Bar Chart (Category Count)
274
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
275
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
276
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
277
  # 1d. Bar Chart (Most Frequent Entities)
278
  word_counts = df['text'].value_counts().reset_index()
279
  word_counts.columns = ['Entity', 'Count']
 
283
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
284
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
285
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
286
  # 1e. Network Graph HTML - IMPORTANT: Pass color map
287
  network_fig = generate_network_graph(df, text_input, entity_color_map)
288
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
289
+ # 1f. Topic Charts HTML
 
290
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
291
  if df_topic_data is not None and not df_topic_data.empty:
292
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
299
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
300
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
301
  topic_charts_html += '</div>'
 
302
  # 2. Get Highlighted Text - IMPORTANT: Pass color map
303
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
 
304
  # 3. Entity Tables (Pandas to HTML)
305
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
306
  classes='table table-striped',
307
  index=False
308
  )
 
309
  # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
310
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
311
  <meta charset="UTF-8">
 
330
  <div class="container">
331
  <h1>{report_title}</h1>
332
  <div class="metadata">
333
+ {branding_html}
+ <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
 
334
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
335
  </div>
336
  <h2>1. Analyzed Text & Extracted Entities</h2>
 
358
  </html>
359
  """
360
  return html_content
 
 
361
  # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
362
  def chunk_text(text, max_chunk_size=1500):
363
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
 
366
  chunks = []
367
  current_chunk = ""
368
  current_offset = 0
 
369
  for segment in segments:
370
  if not segment: continue
 
371
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
372
  # Save the current chunk and its starting offset
373
  chunks.append((current_chunk, current_offset))
 
377
  current_chunk += segment
378
  if current_chunk:
379
  chunks.append((current_chunk, current_offset))
 
380
  return chunks
 
381
  def process_chunked_text(text, labels, model):
382
  """Processes large text in chunks and aggregates/offsets the entities."""
383
  # GLiNER model context size can be around 1024-1500 tokens/words. We use a generous char limit.
384
  # The word count limit is 10000, but we chunk around 500 words for safety/performance.
385
  MAX_CHUNK_CHARS = 3500
 
386
  chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
387
  all_entities = []
 
388
  for chunk_text, chunk_offset in chunks:
389
  # Predict entities on the small chunk
390
  chunk_entities = model.predict_entities(chunk_text, labels)
 
391
  # Offset the start and end indices to match the original document
392
  for entity in chunk_entities:
393
  entity['start'] += chunk_offset
394
  entity['end'] += chunk_offset
395
  all_entities.append(entity)
 
396
  return all_entities
397
  # -----------------------------------
 
398
  # --- Page Configuration and Styling (No Sidebar) ---
399
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 
400
  # --- Conditional Mobile Warning ---
401
  st.markdown(
402
  """
 
410
  [data-testid="stAppViewBlock"] {
411
  background-color: #ffffff !important;
412
  }
 
413
  /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
414
  @media (max-width: 600px) {
415
  #mobile-warning-container {
 
452
  </div>
453
  """,
454
  unsafe_allow_html=True)
455
+
456
+ # --- Sidebar Inputs for Topic Modeling (NEW) ---
457
+ st.sidebar.header("Topic Modeling Settings 💡")
458
+ num_topics_input = st.sidebar.slider(
459
+ "Number of Topics",
460
+ min_value=2,
461
+ max_value=10,
462
+ value=5,
463
+ step=1,
464
+ key='num_topics_slider',
465
+ help="The number of underlying topics (clusters) to discover in the entity data (LDA)."
466
+ )
467
+ num_top_words_input = st.sidebar.slider(
468
+ "Number of Top Words per Topic",
469
+ min_value=5,
470
+ max_value=20,
471
+ value=10,
472
+ step=1,
473
+ key='num_top_words_slider',
474
+ help="The number of most important words to display for each topic."
475
+ )
476
+ st.sidebar.markdown("---")
477
+ # -----------------------------------------------
478
+
479
  st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
480
  # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
 
481
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
482
  with tab1:
483
  with st.expander("Embed"):
 
491
  ></iframe>
492
  '''
493
  st.code(code, language="html")
 
494
  with tab2:
495
  expander = st.expander("**Important Notes**")
496
  expander.markdown("""
497
  **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
 
498
  **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
 
499
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
 
500
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
501
  """)
502
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
 
503
  # --- Comet ML Setup (Placeholder/Conditional) ---
504
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
505
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
507
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
508
 
509
  # --- Model Loading ---
510
+ @st.cache_resource
511
  def load_ner_model(labels):
512
  """Loads the GLiNER model and caches it."""
513
  try:
 
515
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
516
  except Exception as e:
517
  # Log the actual error to the console for debugging
518
+ print(f"FATAL ERROR: Failed to load NER model: {e}")
519
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
520
  st.stop()
 
521
  # --- LONG DEFAULT TEXT (178 Words) ---
522
  DEFAULT_TEXT = (
523
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
 
535
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
536
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
537
  # -----------------------------------
 
538
  # --- Session State Initialization (CRITICAL FIX) ---
539
  if 'show_results' not in st.session_state: st.session_state.show_results = False
540
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
 
581
  with col_clear:
582
  st.button("Clear text", on_click=clear_text, use_container_width=True)
583
 
584
+ # --- Results Trigger and Processing (Completed Logic with Chunking and Topic Vars) ---
585
  if run_button:
586
  # 1. Determine Active Labels and Mode
587
  custom_labels_raw = st.session_state.custom_labels_input
 
596
  else:
597
  st.session_state.active_labels_list = custom_labels_list
598
  st.session_state.is_custom_mode = True
 
599
  else:
600
  st.session_state.active_labels_list = FIXED_LABELS
601
  st.session_state.is_custom_mode = False
 
612
  # Define a safe threshold for when to start chunking (e.g., above 500 words)
613
  CHUNKING_THRESHOLD = 500
614
  should_chunk = word_count > CHUNKING_THRESHOLD
 
615
  mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
616
  if should_chunk:
617
  mode_msg += " with **chunking** for large text"
618
 
619
+ # --- Topic Modeling Input Retrieval ---
620
+ # Get the current slider values
621
+ current_num_topics = st.session_state.num_topics_slider
622
+ current_num_top_words = st.session_state.num_top_words_slider
623
 
624
+ with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
625
+ # Re-run prediction only if text, active labels, OR topic parameters have changed
626
+ current_settings = (text, tuple(active_labels), current_num_topics, current_num_top_words)
627
+ # Add topic settings to last_settings check
628
+ last_settings = (
629
+ st.session_state.last_text,
630
+ tuple(st.session_state.get('last_active_labels', [])),
631
+ st.session_state.get('last_num_topics', None),
632
+ st.session_state.get('last_num_top_words', None)
633
+ )
634
 
635
  if current_settings != last_settings:
 
 
 
636
  start_time = time.time()
637
+ ner_model = load_ner_model(labels=active_labels)
638
 
639
+ # 2. Perform NER Extraction
 
 
 
640
  if should_chunk:
641
+ all_entities_list = process_chunked_text(text, active_labels, ner_model)
 
642
  else:
643
+ all_entities_list = ner_model.predict_entities(text, active_labels)
 
644
 
645
+ df = pd.DataFrame(all_entities_list)
 
646
 
 
 
647
  if df.empty:
648
+ df_topic_data = None
 
 
649
  else:
650
+ # 3. Add Category Mapping
651
+ df['category'] = df['label'].apply(
652
+ lambda l: REVERSE_FIXED_CATEGORY_MAPPING.get(l, "User Defined Entities")
653
+ )
654
+
655
+ # 4. Perform Topic Modeling (Passing the new parameters)
656
+ df_topic_data = perform_topic_modeling(
657
+ df_entities=df,
658
+ num_topics=current_num_topics, # NEW PARAMETER
659
+ num_top_words=current_num_top_words # NEW PARAMETER
660
+ )
661
+
662
+ end_time = time.time()
663
+ elapsed_time = end_time - start_time
664
+
665
+ # 5. Save Results to Session State
666
+ st.session_state.results_df = df
667
+ st.session_state.topic_results = df_topic_data
668
+ st.session_state.elapsed_time = elapsed_time
669
+ st.session_state.last_text = text
670
+ st.session_state.show_results = True
671
+ st.session_state.last_active_labels = active_labels
672
+ st.session_state.last_num_topics = current_num_topics # Save topic settings
673
+ st.session_state.last_num_top_words = current_num_top_words # Save topic settings
674
  else:
675
+ st.info("Results already calculated for the current text and settings.")
676
  st.session_state.show_results = True
677
 
 
678
  # --- Display Download Link and Results (Updated with White-Label inputs) ---
679
  if st.session_state.show_results:
680
  df = st.session_state.results_df
681
  df_topic_data = st.session_state.topic_results
 
682
  # Generate the color map based on the results DF labels
683
  current_labels_in_df = df['label'].unique().tolist()
684
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
 
687
  st.warning("No entities were found in the provided text with the current label set.")
688
  else:
689
  st.subheader("Analysis Results", divider="blue")
 
690
  # 1. Highlighted Text
691
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
692
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
 
693
  # 2. Detailed Entity Analysis Tabs
694
  st.markdown("### 2. Detailed Entity Analysis")
695
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
696
  # Determine which categories to use for the tabs
697
  if st.session_state.is_custom_mode:
698
  unique_categories = ["User Defined Entities"]
 
700
  st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
701
  else:
702
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
 
703
  # --- Section 2a: Detailed Tables by Category/Label ---
704
  with tab_category_details:
705
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
 
706
  if st.session_state.is_custom_mode:
707
  # In custom mode, group by the actual label since the category is just "User Defined Entities"
708
  tabs_list = df['label'].unique().tolist()
 
731
  )
732
  else:
733
  st.info(f"No entities of category **{category}** were found in the text.")
 
734
  # --- INSERTED GLOSSARY HERE ---
735
  with st.expander("See Glossary of tags"):
736
  st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
737
  # --- END GLOSSARY INSERTION ---
 
738
  # --- Section 2b: Treemap Visualization ---
739
  with tab_treemap_viz:
740
  st.markdown("#### Treemap: Entity Distribution")
 
747
  )
748
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
749
  st.plotly_chart(fig_treemap, use_container_width=True)
 
750
  # --- Section 3: Comparative Charts (COMPLETED) ---
751
  st.markdown("---")
752
  st.markdown("### 3. Comparative Charts")
753
  col1, col2, col3 = st.columns(3)
754
  grouped_counts = df['category'].value_counts().reset_index()
755
  grouped_counts.columns = ['Category', 'Count']
 
756
  # Determine color sequence for charts
757
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
 
758
  with col1: # Pie Chart
759
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
760
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
761
  st.plotly_chart(fig_pie, use_container_width=True)
 
762
  with col2: # Bar Chart by Category
763
  st.markdown("#### Entity Count by Category")
764
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
765
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
766
  st.plotly_chart(fig_bar_category, use_container_width=True)
 
767
  with col3: # Bar Chart for Most Frequent Entities
768
  st.markdown("#### Top 10 Most Frequent Entities")
769
  word_counts = df['text'].value_counts().reset_index()
 
775
  st.plotly_chart(fig_bar_freq, use_container_width=True)
776
  else:
777
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
 
778
  # 4. Network Graph and Topic Modeling
779
  st.markdown("---")
780
  st.markdown("### 4. Advanced Analysis")
781
  col_network, col_topic = st.columns(2)
 
782
  with col_network:
783
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
784
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
 
785
  with col_topic:
786
  with st.expander("💡 Topic Modeling (LDA)", expanded=True):
787
+ # Display the current settings used for the topic modeling result
788
+ st.markdown(f"""
789
+ **LDA Parameters:**
790
+ * Topics: **{st.session_state.last_num_topics}**
791
+ * Top Words: **{st.session_state.last_num_top_words}**
792
+ """)
793
  if df_topic_data is not None and not df_topic_data.empty:
794
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
795
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
796
  else:
797
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
 
798
  # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
799
  st.markdown("---")
800
  st.markdown("### 5. White-Label Report Configuration 🎨")
 
801
  # Set a dynamic default title based on the mode
802
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
803
  custom_report_title = st.text_input(
804
  "Type Your Report Title (for HTML Report), and then press Enter.",
805
  value=default_report_title
806
  )
 
807
  # UPDATED: Simplified input for the user
808
  custom_branding_text_input = st.text_area(
809
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
 
811
  key='custom_branding_input',
812
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
813
  )
 
814
  # 6. Downloads (Updated to pass custom variables)
815
  st.markdown("---")
816
  st.markdown("### 6. Downloads")
 
817
  col_csv, col_html = st.columns(2)
 
818
  # CSV Download
819
  csv_buffer = generate_entity_csv(df)
820
  with col_csv:
 
825
  mime="text/csv",
826
  use_container_width=True
827
  )
 
828
  # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
829
  # We wrap the user's plain text in a styled HTML paragraph element
830
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
 
831
  # HTML Download (Passing custom white-label parameters)
832
  html_content = generate_html_report(
833
  df,
 
846
  file_name="ner_topic_full_report.html",
847
  mime="text/html",
848
  use_container_width=True
849
+ )