AIEcosystem committed on
Commit
89b47a1
·
verified ·
1 Parent(s): f1f7a42

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +183 -180
src/streamlit_app.py CHANGED
@@ -42,16 +42,16 @@ FIXED_ENTITY_COLOR_MAP = {
42
  "money": "#f43f5e", # Red
43
  "position": "#a855f7", # Violet
44
  }
 
45
  # --- Fixed Category Mapping ---
46
  FIXED_CATEGORY_MAPPING = {
47
  "People & Roles": ["person", "organization", "position"],
48
  "Locations": ["country", "city"],
49
  "Time & Dates": ["date", "time"],
50
- "Numbers & Finance": ["money", "cardinal"]
51
- }
52
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
53
 
54
- # --- Default Custom Labels (New Requirement) ---
55
  DEFAULT_CUSTOM_LABELS = "person, location, organization, product, date, time, event"
56
 
57
  # --- Dynamic Color Generator for Custom Labels ---
@@ -71,7 +71,7 @@ def get_dynamic_color_map(active_labels, fixed_map):
71
  color_map = {}
72
  if active_labels == FIXED_LABELS:
73
  return fixed_map
74
-
75
  for label in active_labels:
76
  if label in fixed_map:
77
  color_map[label] = fixed_map[label]
@@ -83,9 +83,13 @@ def highlight_entities(text, df_entities, entity_color_map):
83
  """Generates HTML to display text with entities highlighted and colored."""
84
  if df_entities.empty:
85
  return text
86
-
 
 
 
 
87
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
88
- highlighted_text = text
89
 
90
  for entity in entities:
91
  start = max(0, entity['start'])
@@ -93,10 +97,10 @@ def highlight_entities(text, df_entities, entity_color_map):
93
  entity_text_from_full_doc = text[start:end]
94
  label = entity['label']
95
  color = entity_color_map.get(label, '#000000')
96
-
97
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text_from_full_doc}</span>'
98
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
99
-
100
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
101
 
102
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
@@ -104,50 +108,51 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
104
  documents = df_entities['text'].unique().tolist()
105
  if len(documents) < 2:
106
  return None
107
-
108
  N = min(num_top_words, len(documents))
109
-
110
  try:
111
  # Step 1: Try aggressive filtering
112
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
113
  tfidf = tfidf_vectorizer.fit_transform(documents)
114
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
115
-
116
  # Step 2: Fallback if not enough features
117
  if len(tfidf_feature_names) < num_topics:
118
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
119
  tfidf = tfidf_vectorizer.fit_transform(documents)
120
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
121
- if len(tfidf_feature_names) < num_topics:
122
  return None
123
 
124
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
 
125
  lda.fit(tfidf)
126
  topic_data_list = []
127
-
128
  for topic_idx, topic in enumerate(lda.components_):
129
  top_words_indices = topic.argsort()[:-N - 1:-1]
130
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
131
  word_weights = [topic[i] for i in top_words_indices]
132
-
133
  for word, weight in zip(top_words, word_weights):
134
  topic_data_list.append({
135
  'Topic_ID': f'Topic #{topic_idx + 1}',
136
  'Word': word,
137
  'Weight': weight,
138
  })
139
-
140
  return pd.DataFrame(topic_data_list)
141
-
142
  except Exception as e:
143
  # print(f"Topic Modeling Error: {e}")
144
  return None
145
-
146
  def create_topic_word_bubbles(df_topic_data):
147
  """Generates a Plotly Bubble Chart for top words across all topics."""
148
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
149
  df_topic_data['x_pos'] = df_topic_data.index
150
-
151
  if df_topic_data.empty:
152
  return None
153
 
@@ -174,33 +179,56 @@ def create_topic_word_bubbles(df_topic_data):
174
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
175
  )
176
  return fig
177
-
178
  def generate_network_graph(df, raw_text, entity_color_map):
179
- """Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
180
- df = df.reset_index(drop=True)
 
 
 
 
 
 
 
 
 
181
  entity_counts = df['text'].value_counts().reset_index()
182
  entity_counts.columns = ['text', 'frequency']
183
- representative_entities = df.sort_values('score', ascending=False).drop_duplicates(subset=['text'])[['text', 'label', 'score']]
184
- unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
185
 
 
 
 
 
 
 
 
 
 
186
  if unique_entities.shape[0] < 2:
187
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
188
-
 
189
  num_nodes = len(unique_entities)
190
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
191
  radius = 10
192
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
193
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
 
 
194
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
 
195
  edges = set()
196
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
197
 
 
 
198
  for sentence in sentences:
199
  entities_in_sentence = []
200
- for entity_text in unique_entities['text'].unique():
201
  if entity_text.lower() in sentence.lower():
202
  entities_in_sentence.append(entity_text)
203
-
204
  unique_entities_in_sentence = list(set(entities_in_sentence))
205
 
206
  for i in range(len(unique_entities_in_sentence)):
@@ -209,7 +237,8 @@ def generate_network_graph(df, raw_text, entity_color_map):
209
  node2 = unique_entities_in_sentence[j]
210
  edge_tuple = tuple(sorted((node1, node2)))
211
  edges.add(edge_tuple)
212
-
 
213
  edge_x = []
214
  edge_y = []
215
 
@@ -222,7 +251,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
222
  fig = go.Figure()
223
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
224
  fig.add_trace(edge_trace)
225
-
226
  fig.add_trace(go.Scatter(
227
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
228
  marker=dict(
@@ -234,7 +263,8 @@ def generate_network_graph(df, raw_text, entity_color_map):
234
  customdata=unique_entities[['label', 'score', 'frequency']],
235
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
236
  ))
237
-
 
238
  legend_traces = []
239
  seen_labels = set()
240
  for index, row in unique_entities.iterrows():
@@ -243,10 +273,10 @@ def generate_network_graph(df, raw_text, entity_color_map):
243
  seen_labels.add(label)
244
  color = entity_color_map.get(label, '#cccccc')
245
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
246
-
247
  for trace in legend_traces:
248
  fig.add_trace(trace)
249
-
250
  fig.update_layout(
251
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
252
  showlegend=True, hovermode='closest',
@@ -256,7 +286,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
256
  margin=dict(t=50, b=10, l=10, r=10), height=600
257
  )
258
  return fig
259
-
260
  def generate_entity_csv(df):
261
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
262
  csv_buffer = BytesIO()
@@ -264,7 +294,7 @@ def generate_entity_csv(df):
264
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
265
  csv_buffer.seek(0)
266
  return csv_buffer
267
-
268
  # --- HTML REPORT GENERATION FUNCTION ---
269
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
270
  """
@@ -283,7 +313,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
283
  color_discrete_sequence=px.colors.qualitative.Bold
284
  )
285
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
286
- treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
287
 
288
  # 1b. Pie Chart
289
  grouped_counts = df['category'].value_counts().reset_index()
@@ -292,12 +322,12 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
292
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
293
  fig_pie.update_layout(margin=dict(t=50, b=10))
294
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
295
-
296
  # 1c. Bar Chart (Category Count)
297
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
298
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
299
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
300
-
301
  # 1d. Bar Chart (Most Frequent Entities)
302
  word_counts = df['text'].value_counts().reset_index()
303
  word_counts.columns = ['Entity', 'Count']
@@ -307,11 +337,11 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
307
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
308
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
309
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
310
-
311
  # 1e. Network Graph HTML
312
  network_fig = generate_network_graph(df, text_input, entity_color_map)
313
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
314
-
315
  # 1f. Topic Modeling Bubble Chart
316
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
317
  if df_topic_data is not None and not df_topic_data.empty:
@@ -325,23 +355,21 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
325
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
326
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
327
  topic_charts_html += '</div>'
328
-
329
  # 2. Get Highlighted Text
330
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
331
-
332
  # 3. Entity Tables (Pandas to HTML)
333
- # --- FIX: Apply color gradient styling to the DataFrame BEFORE converting to HTML ---
334
  styled_df = df[['text', 'label', 'score', 'start', 'end', 'category']].style.background_gradient(
335
  cmap='YlGnBu',
336
  subset=['score']
337
  ).format({'score': '{:.4f}'})
338
-
339
  entity_table_html = styled_df.to_html(
340
  classes='table table-striped',
341
  index=False,
342
  )
343
- # --- END FIX ---
344
-
345
  # 4. Construct the Final HTML
346
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
347
  <meta charset="UTF-8">
@@ -384,10 +412,10 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
384
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
385
  <div class="chart-box">{pie_html}</div>
386
  <div class="chart-box">{bar_category_html}</div>
387
- <h3>3.3 Most Frequent Entities</h3>
388
- <div class="chart-box">{bar_freq_html}</div>
389
- <h3>3.4 Entity Relationship Map (Edges = Same Sentence)</h3>
390
- <div class="chart-box">{network_html}</div>
391
  <h2>4. Topic Modelling</h2>
392
  {topic_charts_html}
393
  </div>
@@ -402,7 +430,7 @@ def chunk_text(text, max_chunk_size=1500):
402
  chunks = []
403
  current_chunk = ""
404
  current_offset = 0
405
-
406
  for segment in segments:
407
  if not segment: continue
408
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
@@ -411,10 +439,10 @@ def chunk_text(text, max_chunk_size=1500):
411
  current_chunk = segment
412
  else:
413
  current_chunk += segment
414
-
415
  if current_chunk:
416
  chunks.append((current_chunk, current_offset))
417
-
418
  return chunks
419
 
420
  def process_chunked_text(text, labels, model):
@@ -422,14 +450,14 @@ def process_chunked_text(text, labels, model):
422
  MAX_CHUNK_CHARS = 3500
423
  chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
424
  all_entities = []
425
-
426
  for chunk_data, chunk_offset in chunks:
427
  chunk_entities = model.predict_entities(chunk_data, labels)
428
  for entity in chunk_entities:
429
  entity['start'] += chunk_offset
430
  entity['end'] += chunk_offset
431
  all_entities.append(entity)
432
-
433
  return all_entities
434
 
435
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
@@ -499,25 +527,24 @@ with tab1:
499
  ></iframe>
500
  '''
501
  st.code(code, language="html")
502
-
503
  with tab2:
504
  expander = st.expander("**Important Notes**")
505
  expander.markdown("""
506
- **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
507
- **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
508
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
509
- **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
510
  """)
511
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)")
512
 
513
  # --- Model Loading ---
514
- @st.cache_resource
515
  def load_ner_model(labels):
516
  """Loads the GLiNER model and caches it."""
517
  try:
 
518
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
519
  except Exception as e:
520
- print(f"FATAL ERROR: Failed to load NER model: {e}")
521
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
522
  st.stop()
523
 
@@ -532,25 +559,22 @@ DEFAULT_TEXT = (
532
  "space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
533
  "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
534
  "media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
535
- "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
536
- "end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
537
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
538
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
539
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
540
 
541
  # -----------------------------------
542
- # --- Session State Initialization ---
543
  if 'show_results' not in st.session_state: st.session_state.show_results = False
 
544
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
545
  if 'results_df' not in st.session_state: st.session_state.results_df = pd.DataFrame()
546
  if 'elapsed_time' not in st.session_state: st.session_state.elapsed_time = 0.0
547
  if 'topic_results' not in st.session_state: st.session_state.topic_results = None
548
- if 'my_text_area' not in st.session_state: st.session_state.my_text_area = DEFAULT_TEXT
549
- # --- UPDATED: Pre-populate custom labels input ---
550
- if 'custom_labels_input' not in st.session_state: st.session_state.custom_labels_input = DEFAULT_CUSTOM_LABELS
551
- # --- END UPDATED ---
552
  if 'active_labels_list' not in st.session_state: st.session_state.active_labels_list = FIXED_LABELS
553
- if 'is_custom_mode' not in st.session_state: st.session_state.is_custom_mode = False
554
  if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slider = 5
555
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
556
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
@@ -566,66 +590,59 @@ def clear_text():
566
  st.session_state.elapsed_time = 0.0
567
  st.session_state.topic_results = None
568
 
569
- # --- Text Input and Clear Button ---
 
 
570
  word_limit = 10000
571
  text = st.text_area(
572
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
573
  height=250,
574
  key='my_text_area',
575
- )
 
576
  word_count = len(text.split())
577
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
578
 
579
- # --- Custom Labels Input (Now Pre-populated and Editable) ---
580
- custom_labels_text = st.text_area(
581
- "**Optional:** Edit the comma-separated entity labels below to define your own categories. Delete existing labels or add new ones. **(Recommendation: 10-30 distinct labels per run for best results.)**",
582
- height=60,
583
- key='custom_labels_input',
584
- # Placeholder is now unnecessary as the value is pre-populated
585
- )
586
 
587
  col_results, col_clear = st.columns([1, 1])
 
588
  with col_results:
589
- run_button = st.button("Results", key='run_results', use_container_width=True)
 
590
  with col_clear:
591
  st.button("Clear text", on_click=clear_text, use_container_width=True)
592
 
593
- # --- Results Trigger and Processing ---
594
- if run_button:
595
- # 1. Determine Active Labels and Mode
596
- custom_labels_raw = st.session_state.custom_labels_input
597
- if custom_labels_raw.strip():
598
- custom_labels_list = [label.strip().lower() for label in custom_labels_raw.split(',') if label.strip()]
599
- if not custom_labels_list:
600
- st.session_state.active_labels_list = FIXED_LABELS
601
- st.session_state.is_custom_mode = False
602
- st.info("No valid custom labels found. Falling back to default fixed labels.")
603
- else:
604
- st.session_state.active_labels_list = custom_labels_list
605
- st.session_state.is_custom_mode = True
606
- else:
607
- st.session_state.active_labels_list = FIXED_LABELS
608
- st.session_state.is_custom_mode = False
609
-
610
- active_labels = st.session_state.active_labels_list
611
- current_num_topics = st.session_state.num_topics_slider
612
- current_num_top_words = st.session_state.num_top_words_slider
613
 
614
- # Caching Logic: Check if we need to re-run the full process
615
- should_rerun_full_analysis = (
616
- text.strip() != st.session_state.last_text.strip() or
617
- active_labels != st.session_state.last_active_labels
618
- )
619
-
620
  if text.strip() and word_count <= word_limit:
 
 
 
 
 
 
 
 
 
 
621
  if should_rerun_full_analysis:
622
  # 2. Rerunning Full Analysis
623
  CHUNKING_THRESHOLD = 500
624
  should_chunk = word_count > CHUNKING_THRESHOLD
625
- mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
626
  if should_chunk:
627
  mode_msg += " with **chunking** for large text"
628
-
629
  with st.spinner(f"Analyzing text with {mode_msg}..."):
630
  start_time = time.time()
631
 
@@ -645,20 +662,20 @@ if run_button:
645
  df = pd.DataFrame(all_entities)
646
 
647
  if not df.empty:
648
- df = df.reset_index(drop=True)
649
- if st.session_state.is_custom_mode:
650
- df['category'] = 'User Defined Entities'
651
- else:
652
- df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
653
 
 
 
 
654
  df['text'] = df['text'].apply(remove_trailing_punctuation)
655
-
656
  # 2d. Perform Topic Modeling on extracted entities
657
  df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
658
  else:
659
  df_topic_data = None
660
-
661
- # 5. Save Results to Session State
662
  st.session_state.results_df = df
663
  st.session_state.topic_results = df_topic_data
664
  st.session_state.elapsed_time = elapsed_time
@@ -670,6 +687,7 @@ if run_button:
670
  else:
671
  st.info("Results already calculated for the current text and settings.")
672
  st.session_state.show_results = True
 
673
  elif word_count > word_limit:
674
  st.error(f"Text too long! Please limit your input to {word_limit} words.")
675
  st.session_state.show_results = False
@@ -684,26 +702,12 @@ if st.session_state.show_results:
684
 
685
  current_labels_in_df = df['label'].unique().tolist()
686
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
687
-
688
  if df.empty:
689
  st.warning("No entities were found in the provided text with the current label set.")
690
  else:
691
  st.subheader("1. Analysis Results", divider="blue")
692
-
693
-
694
-
695
- # 1. Highlighted Text placed inside an Expander
696
- with st.expander(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})", expanded=False):
697
- st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
698
-
699
-
700
-
701
-
702
-
703
- # 2. Detailed Entity Analysis Tabs
704
- st.markdown("### 2. Detailed Entity Analysis")
705
- tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
706
-
707
  # --- Function to Apply Conditional Coloring to Scores (For Streamlit UI only) ---
708
  def color_score_gradient(df_input):
709
  """Applies a color gradient to the 'score' column using Pandas Styler."""
@@ -714,41 +718,39 @@ if st.session_state.show_results:
714
  {'score': '{:.4f}'}
715
  )
716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  # --- Section 2a: Detailed Tables by Category/Label ---
718
  with tab_category_details:
719
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
720
- if st.session_state.is_custom_mode:
721
- tabs_list = df['label'].unique().tolist()
722
- tabs_category = st.tabs(tabs_list)
723
-
724
- for label, tab in zip(tabs_list, tabs_category):
725
- df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
726
- styled_df_label = color_score_gradient(df_label)
727
- with tab:
728
- st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
729
- st.dataframe(styled_df_label, use_container_width=True)
730
- else:
731
- unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
732
- tabs_category = st.tabs(unique_categories)
733
-
734
- for category, tab in zip(unique_categories, tabs_category):
735
- df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
736
- styled_df_category = color_score_gradient(df_category)
737
- with tab:
738
- st.markdown(f"##### {category} Entities ({len(df_category)} total)")
739
- if not df_category.empty:
740
- st.dataframe(styled_df_category, use_container_width=True)
741
- else:
742
- st.info(f"No entities of category **{category}** were found in the text.")
743
-
744
  with st.expander("See Glossary of tags"):
745
- st.write('''- **text**: ['entity extracted from your text data']
746
- - **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']
747
- - **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']
748
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
749
- - **start**: ['index of the start of the corresponding entity']
750
- - **end**: ['index of the end of the corresponding entity']''')
751
-
752
  # --- Section 2b: Treemap Visualization ---
753
  with tab_treemap_viz:
754
  st.markdown("#### Treemap: Entity Distribution")
@@ -761,7 +763,7 @@ if st.session_state.show_results:
761
  )
762
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
763
  st.plotly_chart(fig_treemap, use_container_width=True)
764
-
765
  # 3. Comparative Charts
766
  st.markdown("---")
767
  st.markdown("### 3. Comparative Charts")
@@ -769,18 +771,18 @@ if st.session_state.show_results:
769
  grouped_counts = df['category'].value_counts().reset_index()
770
  grouped_counts.columns = ['Category', 'Count']
771
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
772
-
773
  with col1: # Pie Chart
774
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
775
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
776
  st.plotly_chart(fig_pie, use_container_width=True)
777
-
778
  with col2: # Bar Chart by Category
779
  st.markdown("#### Entity Count by Category")
780
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
781
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
782
  st.plotly_chart(fig_bar_category, use_container_width=True)
783
-
784
  with col3: # Bar Chart for Most Frequent Entities
785
  st.markdown("#### Top 10 Most Frequent Entities")
786
  word_counts = df['text'].value_counts().reset_index()
@@ -792,22 +794,23 @@ if st.session_state.show_results:
792
  st.plotly_chart(fig_bar_freq, use_container_width=True)
793
  else:
794
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
795
-
796
  # 4. Advanced Analysis
797
  st.markdown("---")
798
  st.markdown("### 4. Advanced Analysis")
799
-
800
  # --- A. Network Graph Section ---
801
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
 
802
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
803
-
804
  # --- B. Topic Modeling Section ---
805
  st.markdown("---")
806
  with st.container(border=True):
807
  st.markdown("#### 💡 Topic Modeling (LDA) Configuration and Results")
808
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
809
-
810
  col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
 
811
  with col_slider_topic:
812
  new_num_topics = st.slider(
813
  "Number of Topics",
@@ -818,6 +821,7 @@ if st.session_state.show_results:
818
  key='num_topics_slider_new',
819
  help="The number of topics to discover (2 to 10)."
820
  )
 
821
  with col_slider_words:
822
  new_num_top_words = st.slider(
823
  "Number of Top Words",
@@ -828,12 +832,12 @@ if st.session_state.show_results:
828
  key='num_top_words_slider_new',
829
  help="The number of top words to display per topic (5 to 20)."
830
  )
831
-
832
  def rerun_topic_model():
833
  # Update session state with the new slider values
834
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
835
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
836
-
837
  if not st.session_state.results_df.empty:
838
  # Recalculate topic modeling results
839
  df_topic_data_new = perform_topic_modeling(
@@ -844,30 +848,30 @@ if st.session_state.show_results:
844
  st.session_state.topic_results = df_topic_data_new
845
  st.session_state.last_num_topics = st.session_state.num_topics_slider
846
  st.session_state.last_num_top_words = st.session_state.num_top_words_slider
847
-
848
  with col_rerun_btn:
849
  st.markdown("<div style='height: 38px;'></div>", unsafe_allow_html=True)
850
  st.button("Re-Run Topic Model", on_click=rerun_topic_model, use_container_width=True, type="primary")
851
-
852
  st.markdown("---")
853
  st.markdown(f"""
854
  **Current LDA Parameters:**
855
  * Topics: **{st.session_state.num_topics_slider}**
856
  * Top Words: **{st.session_state.num_top_words_slider}**
857
  """)
858
-
859
  df_topic_data = st.session_state.topic_results
860
-
861
  if df_topic_data is not None and not df_topic_data.empty:
862
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
863
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
864
  else:
865
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
866
-
867
  # 5. White-Label Configuration
868
  st.markdown("---")
869
  st.markdown("### 5. White-Label Report Configuration 🎨")
870
- default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
871
  custom_report_title = st.text_input(
872
  "Type Your Report Title (for HTML Report), and then press Enter.",
873
  value=default_report_title
@@ -878,12 +882,12 @@ if st.session_state.show_results:
878
  key='custom_branding_input',
879
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
880
  )
881
-
882
  # 6. Downloads
883
  st.markdown("---")
884
  st.markdown("### 6. Downloads")
885
  col_csv, col_html = st.columns(2)
886
-
887
  # CSV Download
888
  csv_buffer = generate_entity_csv(df)
889
  with col_csv:
@@ -894,10 +898,9 @@ if st.session_state.show_results:
894
  mime="text/csv",
895
  use_container_width=True
896
  )
897
-
898
  # HTML Download (Passing custom white-label parameters)
899
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
900
-
901
  html_content = generate_html_report(
902
  df,
903
  st.session_state.last_text,
 
42
  "money": "#f43f5e", # Red
43
  "position": "#a855f7", # Violet
44
  }
45
+
46
  # --- Fixed Category Mapping ---
47
  FIXED_CATEGORY_MAPPING = {
48
  "People & Roles": ["person", "organization", "position"],
49
  "Locations": ["country", "city"],
50
  "Time & Dates": ["date", "time"],
51
+ "Numbers & Finance": ["money", "cardinal"]}
 
52
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
53
 
54
+ # --- Default Custom Labels (Not used, but kept for full code compatibility) ---
55
  DEFAULT_CUSTOM_LABELS = "person, location, organization, product, date, time, event"
56
 
57
  # --- Dynamic Color Generator for Custom Labels ---
 
71
  color_map = {}
72
  if active_labels == FIXED_LABELS:
73
  return fixed_map
74
+
75
  for label in active_labels:
76
  if label in fixed_map:
77
  color_map[label] = fixed_map[label]
 
83
  """Generates HTML to display text with entities highlighted and colored."""
84
  if df_entities.empty:
85
  return text
86
+
87
+ # --- FIX: Ensure the DataFrame has a unique index before sorting/converting ---
88
+ # Create a copy and reset index for safety, resolving potential errors in the to_dict step.
89
+ df_entities = df_entities.copy().reset_index(drop=True)
90
+
91
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
92
+ highlighted_text = text
93
 
94
  for entity in entities:
95
  start = max(0, entity['start'])
 
97
  entity_text_from_full_doc = text[start:end]
98
  label = entity['label']
99
  color = entity_color_map.get(label, '#000000')
100
+
101
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text_from_full_doc}</span>'
102
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
103
+
104
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
105
 
106
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
 
108
  documents = df_entities['text'].unique().tolist()
109
  if len(documents) < 2:
110
  return None
111
+
112
  N = min(num_top_words, len(documents))
113
+
114
  try:
115
  # Step 1: Try aggressive filtering
116
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
117
  tfidf = tfidf_vectorizer.fit_transform(documents)
118
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
119
+
120
  # Step 2: Fallback if not enough features
121
  if len(tfidf_feature_names) < num_topics:
122
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
123
  tfidf = tfidf_vectorizer.fit_transform(documents)
124
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
125
+ if len(tfidf_feature_names) < num_topics:
126
  return None
127
 
128
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
129
+
130
  lda.fit(tfidf)
131
  topic_data_list = []
132
+
133
  for topic_idx, topic in enumerate(lda.components_):
134
  top_words_indices = topic.argsort()[:-N - 1:-1]
135
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
136
  word_weights = [topic[i] for i in top_words_indices]
137
+
138
  for word, weight in zip(top_words, word_weights):
139
  topic_data_list.append({
140
  'Topic_ID': f'Topic #{topic_idx + 1}',
141
  'Word': word,
142
  'Weight': weight,
143
  })
144
+
145
  return pd.DataFrame(topic_data_list)
146
+
147
  except Exception as e:
148
  # print(f"Topic Modeling Error: {e}")
149
  return None
150
+
151
  def create_topic_word_bubbles(df_topic_data):
152
  """Generates a Plotly Bubble Chart for top words across all topics."""
153
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
154
  df_topic_data['x_pos'] = df_topic_data.index
155
+
156
  if df_topic_data.empty:
157
  return None
158
 
 
179
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
180
  )
181
  return fig
182
+
183
  def generate_network_graph(df, raw_text, entity_color_map):
184
+ """
185
+ Generates a network graph visualization (Node Plot) with edges based on
186
+ entity co-occurrence in sentences.
187
+
188
+ FIXED: The logic for creating 'unique_entities' is revised to guarantee
189
+ that the 'text' column is unique, resolving the ValueError.
190
+ """
191
+
192
+ # 1. Prepare Data for Nodes
193
+
194
+ # Calculate frequency (count)
195
  entity_counts = df['text'].value_counts().reset_index()
196
  entity_counts.columns = ['text', 'frequency']
 
 
197
 
198
+ # Sort the dataframe by score descending *before* dropping duplicates to ensure the best score/label is kept
199
+ df_sorted = df.sort_values('score', ascending=False).reset_index(drop=True)
200
+
201
+ # Drop duplicates based on 'text' to guarantee unique entity names for the index
202
+ unique_entities_data = df_sorted.drop_duplicates(subset=['text'])[['text', 'label', 'score']]
203
+
204
+ # Merge the unique data with the frequency counts
205
+ unique_entities = unique_entities_data.merge(entity_counts, on='text', how='left')
206
+
207
  if unique_entities.shape[0] < 2:
208
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
209
+
210
+ # 2. Node Positioning
211
  num_nodes = len(unique_entities)
212
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
213
  radius = 10
214
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
215
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
216
+
217
+ # This line now succeeds because 'text' is guaranteed to be unique
218
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
219
+
220
+ # 3. Edge Calculation (Co-occurrence)
221
  edges = set()
222
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
223
 
224
+ unique_entity_texts = unique_entities['text'].unique().tolist()
225
+
226
  for sentence in sentences:
227
  entities_in_sentence = []
228
+ for entity_text in unique_entity_texts:
229
  if entity_text.lower() in sentence.lower():
230
  entities_in_sentence.append(entity_text)
231
+
232
  unique_entities_in_sentence = list(set(entities_in_sentence))
233
 
234
  for i in range(len(unique_entities_in_sentence)):
 
237
  node2 = unique_entities_in_sentence[j]
238
  edge_tuple = tuple(sorted((node1, node2)))
239
  edges.add(edge_tuple)
240
+
241
+ # 4. Plotly Figure Generation
242
  edge_x = []
243
  edge_y = []
244
 
 
251
  fig = go.Figure()
252
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
253
  fig.add_trace(edge_trace)
254
+
255
  fig.add_trace(go.Scatter(
256
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
257
  marker=dict(
 
263
  customdata=unique_entities[['label', 'score', 'frequency']],
264
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
265
  ))
266
+
267
+ # 5. Legend and Layout
268
  legend_traces = []
269
  seen_labels = set()
270
  for index, row in unique_entities.iterrows():
 
273
  seen_labels.add(label)
274
  color = entity_color_map.get(label, '#cccccc')
275
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
276
+
277
  for trace in legend_traces:
278
  fig.add_trace(trace)
279
+
280
  fig.update_layout(
281
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
282
  showlegend=True, hovermode='closest',
 
286
  margin=dict(t=50, b=10, l=10, r=10), height=600
287
  )
288
  return fig
289
+
290
  def generate_entity_csv(df):
291
  """Generates a CSV file of the extracted entities in an in-memory buffer."""
292
  csv_buffer = BytesIO()
 
294
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
295
  csv_buffer.seek(0)
296
  return csv_buffer
297
+
298
  # --- HTML REPORT GENERATION FUNCTION ---
299
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
300
  """
 
313
  color_discrete_sequence=px.colors.qualitative.Bold
314
  )
315
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
316
+ treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
317
 
318
  # 1b. Pie Chart
319
  grouped_counts = df['category'].value_counts().reset_index()
 
322
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
323
  fig_pie.update_layout(margin=dict(t=50, b=10))
324
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
325
+
326
  # 1c. Bar Chart (Category Count)
327
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
328
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
329
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
330
+
331
  # 1d. Bar Chart (Most Frequent Entities)
332
  word_counts = df['text'].value_counts().reset_index()
333
  word_counts.columns = ['Entity', 'Count']
 
337
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
338
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
339
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
340
+
341
  # 1e. Network Graph HTML
342
  network_fig = generate_network_graph(df, text_input, entity_color_map)
343
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
344
+
345
  # 1f. Topic Modeling Bubble Chart
346
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
347
  if df_topic_data is not None and not df_topic_data.empty:
 
355
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
356
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
357
  topic_charts_html += '</div>'
358
+
359
  # 2. Get Highlighted Text
360
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
361
+
362
  # 3. Entity Tables (Pandas to HTML)
363
+ # Apply color gradient styling to the DataFrame BEFORE converting to HTML
364
  styled_df = df[['text', 'label', 'score', 'start', 'end', 'category']].style.background_gradient(
365
  cmap='YlGnBu',
366
  subset=['score']
367
  ).format({'score': '{:.4f}'})
 
368
  entity_table_html = styled_df.to_html(
369
  classes='table table-striped',
370
  index=False,
371
  )
372
+
 
373
  # 4. Construct the Final HTML
374
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
375
  <meta charset="UTF-8">
 
412
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
413
  <div class="chart-box">{pie_html}</div>
414
  <div class="chart-box">{bar_category_html}</div>
415
+ <h3>3.3 Most Frequent Entities</h3>
416
+ <div class="chart-box">{bar_freq_html}</div>
417
+ <h3>3.4 Entity Relationship Map (Edges = Same Sentence)</h3>
418
+ <div class="chart-box">{network_html}</div>
419
  <h2>4. Topic Modelling</h2>
420
  {topic_charts_html}
421
  </div>
 
430
  chunks = []
431
  current_chunk = ""
432
  current_offset = 0
433
+
434
  for segment in segments:
435
  if not segment: continue
436
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
 
439
  current_chunk = segment
440
  else:
441
  current_chunk += segment
442
+
443
  if current_chunk:
444
  chunks.append((current_chunk, current_offset))
445
+
446
  return chunks
447
 
448
  def process_chunked_text(text, labels, model):
 
450
  MAX_CHUNK_CHARS = 3500
451
  chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
452
  all_entities = []
453
+
454
  for chunk_data, chunk_offset in chunks:
455
  chunk_entities = model.predict_entities(chunk_data, labels)
456
  for entity in chunk_entities:
457
  entity['start'] += chunk_offset
458
  entity['end'] += chunk_offset
459
  all_entities.append(entity)
460
+
461
  return all_entities
462
 
463
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 
527
  ></iframe>
528
  '''
529
  st.code(code, language="html")
 
530
  with tab2:
531
  expander = st.expander("**Important Notes**")
532
  expander.markdown("""
533
+ **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) fixed labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
 
534
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
535
+ **How to Use:** Type or paste your text into the text area below, then click the 'Analyze Text' button.
536
  """)
537
  st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)")
538
 
539
# --- Model Loading ---
@st.cache_resource
def load_ner_model(labels):
    """Load the GLiNER multitask model once and cache it across app reruns.

    Note: the decorator must be ``st.cache_resource`` — ``st.cache_resourced``
    does not exist in the Streamlit API and would raise ``AttributeError`` at
    import time.

    Args:
        labels: Sequence of entity labels passed to the model as generation
            constraints (``gen_constraints``).

    Returns:
        The loaded GLiNER model instance.

    Side effects:
        On failure, shows a Streamlit error message and stops the script run
        via ``st.stop()`` (nothing downstream can work without the model).
    """
    try:
        return GLiNER.from_pretrained(
            "knowledgator/gliner-multitask-large-v0.5",
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as e:
        # Surface the failure in the UI rather than crashing silently.
        st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
        st.stop()
550
 
 
559
  "space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
560
  "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
561
  "media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
562
+ "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
563
+ "end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
564
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
565
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
566
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
567
 
568
  # -----------------------------------
569
+ # --- Session State Initialization (Cleaned up for Text Area Input) ---
570
  if 'show_results' not in st.session_state: st.session_state.show_results = False
571
+ if 'my_text_area' not in st.session_state: st.session_state.my_text_area = DEFAULT_TEXT
572
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
573
  if 'results_df' not in st.session_state: st.session_state.results_df = pd.DataFrame()
574
  if 'elapsed_time' not in st.session_state: st.session_state.elapsed_time = 0.0
575
  if 'topic_results' not in st.session_state: st.session_state.topic_results = None
 
 
 
 
576
  if 'active_labels_list' not in st.session_state: st.session_state.active_labels_list = FIXED_LABELS
577
+ if 'is_custom_mode' not in st.session_state: st.session_state.is_custom_mode = False # Force Fixed Mode
578
  if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slider = 5
579
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
580
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
 
590
  st.session_state.elapsed_time = 0.0
591
  st.session_state.topic_results = None
592
 
593
+ # --- Revised Text Area Input ---
594
+ st.markdown("## ✍️ Text Input for Analysis")
595
+
596
  word_limit = 10000
597
  text = st.text_area(
598
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
599
  height=250,
600
  key='my_text_area',
601
+ )
602
+
603
  word_count = len(text.split())
604
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
605
 
606
+ # Always Fixed Mode controls
607
+ st.markdown("---")
608
+ st.markdown("### Analysis Mode: **Fixed Entity Labels**")
609
+ st.info(f"The analysis will use the pre-defined fixed label set: **{', '.join(FIXED_LABELS)}**")
 
 
 
610
 
611
  col_results, col_clear = st.columns([1, 1])
612
+
613
  with col_results:
614
+ run_button = st.button("Analyze Text", key='run_results', use_container_width=True, type="primary")
615
+
616
  with col_clear:
617
  st.button("Clear text", on_click=clear_text, use_container_width=True)
618
 
619
+ # --- Define Active Labels and Settings ---
620
+ active_labels = FIXED_LABELS # Always fixed labels
621
+ st.session_state.active_labels_list = active_labels
622
+ current_num_topics = st.session_state.num_topics_slider
623
+ current_num_top_words = st.session_state.num_top_words_slider
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
 
625
+ # --- Results Trigger and Processing (Fixed for index error) ---
626
+ if run_button:
 
 
 
 
627
  if text.strip() and word_count <= word_limit:
628
+
629
+ # 1. Determine Active Labels and Mode (Already determined: Fixed Mode)
630
+ active_labels = st.session_state.active_labels_list
631
+
632
+ # Caching Logic: Check if we need to re-run the full process
633
+ should_rerun_full_analysis = (
634
+ text.strip() != st.session_state.last_text.strip() or
635
+ active_labels != st.session_state.last_active_labels
636
+ )
637
+
638
  if should_rerun_full_analysis:
639
  # 2. Rerunning Full Analysis
640
  CHUNKING_THRESHOLD = 500
641
  should_chunk = word_count > CHUNKING_THRESHOLD
642
+ mode_msg = "fixed labels"
643
  if should_chunk:
644
  mode_msg += " with **chunking** for large text"
645
+
646
  with st.spinner(f"Analyzing text with {mode_msg}..."):
647
  start_time = time.time()
648
 
 
662
  df = pd.DataFrame(all_entities)
663
 
664
  if not df.empty:
665
+ # 💥 FIX: Reset to a default, unique integer index
666
+ df = df.reset_index(drop=True)
 
 
 
667
 
668
+ # Force fixed category mapping
669
+ df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
670
+
671
  df['text'] = df['text'].apply(remove_trailing_punctuation)
672
+
673
  # 2d. Perform Topic Modeling on extracted entities
674
  df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
675
  else:
676
  df_topic_data = None
677
+
678
+ # 3. Save Results to Session State
679
  st.session_state.results_df = df
680
  st.session_state.topic_results = df_topic_data
681
  st.session_state.elapsed_time = elapsed_time
 
687
  else:
688
  st.info("Results already calculated for the current text and settings.")
689
  st.session_state.show_results = True
690
+
691
  elif word_count > word_limit:
692
  st.error(f"Text too long! Please limit your input to {word_limit} words.")
693
  st.session_state.show_results = False
 
702
 
703
  current_labels_in_df = df['label'].unique().tolist()
704
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
705
+
706
  if df.empty:
707
  st.warning("No entities were found in the provided text with the current label set.")
708
  else:
709
  st.subheader("1. Analysis Results", divider="blue")
710
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  # --- Function to Apply Conditional Coloring to Scores (For Streamlit UI only) ---
712
  def color_score_gradient(df_input):
713
  """Applies a color gradient to the 'score' column using Pandas Styler."""
 
718
  {'score': '{:.4f}'}
719
  )
720
 
721
+ # 1. Highlighted Text placed inside an Expander
722
+ # Force Fixed Mode in display header
723
+ with st.expander(f"### 1. Analyzed Text with Highlighted Entities (Fixed Mode)", expanded=False):
724
+ st.markdown(
725
+ highlight_entities(st.session_state.last_text, df, entity_color_map),
726
+ unsafe_allow_html=True
727
+ )
728
+ st.markdown(f"**Total Entities Found:** {len(df)}")
729
+
730
+ # 2. Detailed Entity Analysis Tabs
731
+ st.markdown("### 2. Detailed Entity Analysis")
732
+ tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
733
+
734
  # --- Section 2a: Detailed Tables by Category/Label ---
735
  with tab_category_details:
736
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
737
+
738
+ # This section now ONLY executes the FIXED MODE logic
739
+ unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
740
+ tabs_category = st.tabs(unique_categories)
741
+ for category, tab in zip(unique_categories, tabs_category):
742
+ df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
743
+ styled_df_category = color_score_gradient(df_category)
744
+ with tab:
745
+ st.markdown(f"##### {category} Entities ({len(df_category)} total)")
746
+ if not df_category.empty:
747
+ st.dataframe(styled_df_category, use_container_width=True)
748
+ else:
749
+ st.info(f"No entities of category **{category}** were found in the text.")
750
+
 
 
 
 
 
 
 
 
 
 
751
  with st.expander("See Glossary of tags"):
752
+ st.write('''- **text**: ['entity extracted from your text data']- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']- **category**: ['the grouping category (e.g., "Locations" or "User Defined Entities")']- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']- **start**: ['index of the start of the corresponding entity']- **end**: ['index of the end of the corresponding entity']''')
753
+
 
 
 
 
 
754
  # --- Section 2b: Treemap Visualization ---
755
  with tab_treemap_viz:
756
  st.markdown("#### Treemap: Entity Distribution")
 
763
  )
764
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
765
  st.plotly_chart(fig_treemap, use_container_width=True)
766
+
767
  # 3. Comparative Charts
768
  st.markdown("---")
769
  st.markdown("### 3. Comparative Charts")
 
771
  grouped_counts = df['category'].value_counts().reset_index()
772
  grouped_counts.columns = ['Category', 'Count']
773
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
774
+
775
  with col1: # Pie Chart
776
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
777
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
778
  st.plotly_chart(fig_pie, use_container_width=True)
779
+
780
  with col2: # Bar Chart by Category
781
  st.markdown("#### Entity Count by Category")
782
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
783
  fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
784
  st.plotly_chart(fig_bar_category, use_container_width=True)
785
+
786
  with col3: # Bar Chart for Most Frequent Entities
787
  st.markdown("#### Top 10 Most Frequent Entities")
788
  word_counts = df['text'].value_counts().reset_index()
 
794
  st.plotly_chart(fig_bar_freq, use_container_width=True)
795
  else:
796
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
797
+
798
  # 4. Advanced Analysis
799
  st.markdown("---")
800
  st.markdown("### 4. Advanced Analysis")
801
+
802
  # --- A. Network Graph Section ---
803
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
804
+ # This calls the CORRECTED generate_network_graph function
805
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
806
+
807
  # --- B. Topic Modeling Section ---
808
  st.markdown("---")
809
  with st.container(border=True):
810
  st.markdown("#### 💡 Topic Modeling (LDA) Configuration and Results")
811
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
 
812
  col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
813
+
814
  with col_slider_topic:
815
  new_num_topics = st.slider(
816
  "Number of Topics",
 
821
  key='num_topics_slider_new',
822
  help="The number of topics to discover (2 to 10)."
823
  )
824
+
825
  with col_slider_words:
826
  new_num_top_words = st.slider(
827
  "Number of Top Words",
 
832
  key='num_top_words_slider_new',
833
  help="The number of top words to display per topic (5 to 20)."
834
  )
835
+
836
  def rerun_topic_model():
837
  # Update session state with the new slider values
838
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
839
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
840
+
841
  if not st.session_state.results_df.empty:
842
  # Recalculate topic modeling results
843
  df_topic_data_new = perform_topic_modeling(
 
848
  st.session_state.topic_results = df_topic_data_new
849
  st.session_state.last_num_topics = st.session_state.num_topics_slider
850
  st.session_state.last_num_top_words = st.session_state.num_top_words_slider
851
+
852
  with col_rerun_btn:
853
  st.markdown("<div style='height: 38px;'></div>", unsafe_allow_html=True)
854
  st.button("Re-Run Topic Model", on_click=rerun_topic_model, use_container_width=True, type="primary")
855
+
856
  st.markdown("---")
857
  st.markdown(f"""
858
  **Current LDA Parameters:**
859
  * Topics: **{st.session_state.num_topics_slider}**
860
  * Top Words: **{st.session_state.num_top_words_slider}**
861
  """)
862
+
863
  df_topic_data = st.session_state.topic_results
864
+
865
  if df_topic_data is not None and not df_topic_data.empty:
866
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
867
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
868
  else:
869
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
870
+
871
  # 5. White-Label Configuration
872
  st.markdown("---")
873
  st.markdown("### 5. White-Label Report Configuration 🎨")
874
+ default_report_title = "Fixed Entity Analysis Report"
875
  custom_report_title = st.text_input(
876
  "Type Your Report Title (for HTML Report), and then press Enter.",
877
  value=default_report_title
 
882
  key='custom_branding_input',
883
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
884
  )
885
+
886
  # 6. Downloads
887
  st.markdown("---")
888
  st.markdown("### 6. Downloads")
889
  col_csv, col_html = st.columns(2)
890
+
891
  # CSV Download
892
  csv_buffer = generate_entity_csv(df)
893
  with col_csv:
 
898
  mime="text/csv",
899
  use_container_width=True
900
  )
901
+
902
  # HTML Download (Passing custom white-label parameters)
903
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
 
904
  html_content = generate_html_report(
905
  df,
906
  st.session_state.last_text,