AIEcosystem committed on
Commit
0024c4d
·
verified ·
1 Parent(s): 1c3f8f0

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +180 -204
src/streamlit_app.py CHANGED
@@ -16,6 +16,7 @@ from sklearn.decomposition import LatentDirichletAllocation
16
  # ------------------------------
17
  from gliner import GLiNER
18
  from streamlit_extras.stylable_container import stylable_container
 
19
  # Using a try/except for comet_ml import
20
  try:
21
  from comet_ml import Experiment
@@ -25,9 +26,11 @@ except ImportError:
25
  def log_parameter(self, *args): pass
26
  def log_table(self, *args): pass
27
  def end(self): pass
 
28
  # --- Model Home Directory (Fix for deployment environments) ---
29
  # Set HF_HOME environment variable to a writable path
30
  os.environ['HF_HOME'] = '/tmp'
 
31
  # --- Color Map for Highlighting and Network Graph Nodes ---
32
  entity_color_map = {
33
  "person": "#10b981",
@@ -46,23 +49,26 @@ entity_color_map = {
46
  "url": "#60a5fa",
47
  "nationality_religion": "#fb7185"
48
  }
 
49
  # --- Utility Functions ---
50
  def extract_label(node_name):
51
  """Extracts the label from a node string like 'Text (Label)'."""
52
  match = re.search(r'\(([^)]+)\)$', node_name)
53
  return match.group(1) if match else "Unknown"
54
-
55
  def remove_trailing_punctuation(text_string):
56
  """Removes trailing punctuation from a string."""
57
  return text_string.rstrip(string.punctuation)
58
-
59
  def highlight_entities(text, df_entities):
60
  """Generates HTML to display text with entities highlighted and colored."""
61
  if df_entities.empty:
62
  return text
 
63
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
64
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
65
  highlighted_text = text
 
66
  for entity in entities:
67
  start = entity['start']
68
  end = entity['end']
@@ -72,167 +78,152 @@ def highlight_entities(text, df_entities):
72
 
73
  # Create a span with background color and tooltip
74
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
75
-
76
  # Replace the original text segment with the highlighted HTML
77
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
 
78
  # Use a div to mimic the Streamlit input box style for the report
79
  return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
80
 
81
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
82
  """
83
- Performs basic Topic Modeling using LDA on the extracted entities
84
  and returns structured data for visualization.
85
-
86
  Includes updated TF-IDF parameters (stop_words='english', max_df=0.95, min_df=1).
87
  """
88
  # Aggregate all unique entity text into a single document list
89
  documents = df_entities['text'].unique().tolist()
90
-
91
  if len(documents) < 2:
92
  return None
93
-
94
- N = min(num_top_words, len(documents))
95
 
 
96
  try:
97
- # UPDATED: Added stop_words='english' to filter common words tokenized
98
  # from multi-word entities (e.g., "The" from "The White House").
99
  tfidf_vectorizer = TfidfVectorizer(
100
- max_df=0.95,
101
  min_df=1, # Retained at 1 to keep all unique entities
102
  stop_words='english' # <-- THIS IS THE KEY ADDITION
103
  )
104
  tfidf = tfidf_vectorizer.fit_transform(documents)
105
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
106
-
107
  lda = LatentDirichletAllocation(
108
- n_components=num_topics, max_iter=5, learning_method='online',
109
- random_state=42, n_jobs=-1
110
  )
111
  lda.fit(tfidf)
112
-
113
  topic_data_list = []
114
  for topic_idx, topic in enumerate(lda.components_):
115
- top_words_indices = topic.argsort()[:-N - 1:-1]
116
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
117
  word_weights = [topic[i] for i in top_words_indices]
118
-
119
  for word, weight in zip(top_words, word_weights):
120
  topic_data_list.append({
121
  'Topic_ID': f'Topic #{topic_idx + 1}',
122
  'Word': word,
123
  'Weight': weight,
124
  })
125
-
126
  return pd.DataFrame(topic_data_list)
127
-
128
  except Exception as e:
129
  st.error(f"Topic modeling failed: {e}")
130
  return None
131
-
132
  def create_topic_word_bubbles(df_topic_data):
133
  """Generates a Plotly Bubble Chart for top words across all topics."""
134
-
135
  if df_topic_data.empty:
136
  return None
137
-
138
  fig = px.scatter(
139
- df_topic_data,
140
- x='Word',
141
- y='Topic_ID',
142
- size='Weight',
143
  color='Topic_ID',
144
- size_max=80,
145
  title='Topic Word Weights (Bubble Chart)',
146
  color_discrete_sequence=px.colors.qualitative.Bold,
147
  hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False}
148
  )
149
-
150
  fig.update_layout(
151
  xaxis_title="Entity/Word (Bubble size = Word Weight)",
152
  yaxis_title="Topic ID",
153
  xaxis={'tickangle': -45, 'showgrid': False},
154
  yaxis={'showgrid': True, 'autorange': 'reversed'},
155
  showlegend=True,
156
- plot_bgcolor='#FFF0F5',
157
  paper_bgcolor='#FFF0F5',
158
  height=600,
159
  margin=dict(t=50, b=100, l=50, r=10),
160
  )
161
-
162
  fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
163
-
164
  return fig
165
-
166
  def generate_network_graph(df, raw_text):
167
  """
168
- Generates a network graph visualization (Node Plot) with edges
169
  based on entity co-occurrence in sentences.
170
  """
171
  entity_counts = df['text'].value_counts().reset_index()
172
  entity_counts.columns = ['text', 'frequency']
173
-
174
  # Merge counts with unique entities (text + label)
175
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
176
-
177
  if unique_entities.shape[0] < 2:
178
  # Return a simple figure with a message if not enough data
179
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
180
-
181
  num_nodes = len(unique_entities)
182
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
183
-
184
- radius = 10
185
-
186
  # Assign circular positions + a little randomness
187
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
188
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
189
-
190
  # Map entity text to its coordinates for easy lookup
191
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
192
-
193
  # ----------------------------------------------------------------------
194
  # 1. Identify Edges (Co-occurrence in sentences)
195
  # ----------------------------------------------------------------------
196
  edges = set()
197
-
198
  # Simple sentence segmentation (handles standard punctuation followed by space)
199
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
200
-
201
  for sentence in sentences:
202
  # Find unique entities that are substrings of this sentence
203
  entities_in_sentence = []
204
  for entity_text in unique_entities['text'].unique():
205
  if entity_text.lower() in sentence.lower():
206
  entities_in_sentence.append(entity_text)
207
-
208
  # Create edges (pairs) based on co-occurrence
209
  unique_entities_in_sentence = list(set(entities_in_sentence))
210
-
211
  # Create all unique pairs (edges)
212
  for i in range(len(unique_entities_in_sentence)):
213
  for j in range(i + 1, len(unique_entities_in_sentence)):
214
  node1 = unique_entities_in_sentence[i]
215
  node2 = unique_entities_in_sentence[j]
216
-
217
  # Ensure consistent order for the set to avoid duplicates like (A, B) and (B, A)
218
  edge_tuple = tuple(sorted((node1, node2)))
219
  edges.add(edge_tuple)
220
-
221
  # ----------------------------------------------------------------------
222
  # 2. Create Plotly Trace Data for Edges
223
  # ----------------------------------------------------------------------
224
  edge_x = []
225
  edge_y = []
226
-
227
  for edge in edges:
228
  n1, n2 = edge
229
  if n1 in pos_map and n2 in pos_map:
230
  # Append coordinates for line segment: [x1, x2, None] for separation
231
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
232
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
233
-
234
  fig = go.Figure()
235
-
236
  # Add Edge Trace (Lines)
237
  edge_trace = go.Scatter(
238
  x=edge_x, y=edge_y,
@@ -243,7 +234,6 @@ def generate_network_graph(df, raw_text):
243
  showlegend=False # Edges don't need a legend entry
244
  )
245
  fig.add_trace(edge_trace)
246
-
247
  # ----------------------------------------------------------------------
248
  # 3. Add Node Trace (Markers)
249
  # ----------------------------------------------------------------------
@@ -254,9 +244,9 @@ def generate_network_graph(df, raw_text):
254
  name='Entities',
255
  text=unique_entities['text'],
256
  textposition="top center",
257
- # FIX: Explicitly set showlegend=False for the main node trace
258
  # as we are creating separate traces for the legend colors below.
259
- showlegend=False,
260
  marker=dict(
261
  size=unique_entities['frequency'] * 5 + 10,
262
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
@@ -273,7 +263,7 @@ def generate_network_graph(df, raw_text):
273
  "Frequency: %{customdata[2]}<extra></extra>"
274
  )
275
  ))
276
-
277
  # Adding discrete traces for the legend based on unique labels
278
  legend_traces = []
279
  seen_labels = set()
@@ -287,7 +277,7 @@ def generate_network_graph(df, raw_text):
287
  y=[None],
288
  mode='markers',
289
  marker=dict(size=10, color=color),
290
- name=f"{label.capitalize()}",
291
  showlegend=True # Ensure legend traces are explicitly visible
292
  ))
293
  for trace in legend_traces:
@@ -305,70 +295,61 @@ def generate_network_graph(df, raw_text):
305
  margin=dict(t=50, b=10, l=10, r=10),
306
  height=600
307
  )
308
-
309
  return fig
310
 
311
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
312
  """
313
  Generates a full HTML report containing all analysis results and visualizations.
314
-
315
- FIX 1: Added a discrete color sequence to the Treemap to prevent black color.
316
- FIX 2: Adjusted CSS grid properties and added min-width to grid items to prevent plot overlap.
 
317
  """
318
-
319
  # 1. Generate Visualizations (Plotly HTML)
320
-
321
  # 1a. Treemap
322
  # FIX 1: Explicitly set a color_discrete_sequence to prevent the Treemap from being black
323
  fig_treemap = px.treemap(
324
- df,
325
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
326
  values='score',
327
- color='category',
328
  title="Entity Distribution by Category and Label",
329
  color_discrete_sequence=px.colors.qualitative.Dark24 # Use a robust color sequence
330
  )
331
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
332
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
333
-
334
  # 1b. Pie Chart
335
  grouped_counts = df['category'].value_counts().reset_index()
336
  grouped_counts.columns = ['Category', 'Count']
337
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',
338
- title='Distribution of Entities by Category',
339
- color_discrete_sequence=px.colors.sequential.RdBu)
340
  fig_pie.update_layout(margin=dict(t=50, b=10))
341
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
342
-
343
  # 1c. Bar Chart (Category Count)
344
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
345
- color='Category', title='Total Entities per Category',
346
- color_discrete_sequence=px.colors.qualitative.Pastel)
347
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},
348
- margin=dict(t=50, b=10))
349
- bar_category_html = fig_bar_category.to_html(full_html=False,
350
- include_plotlyjs='cdn')
351
-
352
  # 1d. Bar Chart (Most Frequent Entities)
353
- word_counts = df['text'].value_counts().reset_index()
354
- word_counts.columns = ['Entity', 'Count']
355
-
356
  # Top 10 repeating entities
357
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
358
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
359
-
360
  if not repeating_entities.empty:
361
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
362
- color='Entity', title='Top 10 Most Frequent Entities',
363
- color_discrete_sequence=px.colors.sequential.Plasma)
364
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},
365
- margin=dict(t=50, b=10))
366
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
367
-
368
  # 1e. Network Graph HTML - UPDATED to pass text_input
369
  network_fig = generate_network_graph(df, text_input)
370
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
371
-
372
  # 1f. Topic Charts HTML (Now a single Bubble Chart with Placeholder logic)
373
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
374
  if df_topic_data is not None and not df_topic_data.empty:
@@ -383,16 +364,16 @@ margin=dict(t=50, b=10))
383
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
384
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
385
  topic_charts_html += '</div>'
386
-
387
  # 2. Get Highlighted Text
388
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
389
-
390
  # 3. Entity Tables (Pandas to HTML)
391
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
392
- classes='table table-striped',
393
  index=False
394
  )
395
-
396
  # 4. Construct the Final HTML
397
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
398
  <meta charset="UTF-8">
@@ -406,20 +387,21 @@ margin=dict(t=50, b=10))
406
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
407
  h3 {{ color: #555; margin-top: 20px; }}
408
  .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
409
- /* FIX 2: Modified grid to ensure each item gets min 30% of the container width */
410
- .grid {{
411
- display: grid;
412
- grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); /* Adjusted min-width for better fit */
413
- gap: 20px;
414
- margin-top: 20px;
415
  }}
416
- .chart-box {{
417
- background-color: #f9f9f9;
418
- padding: 15px;
419
- border-radius: 8px;
420
  box-shadow: 0 2px 4px rgba(0,0,0,0.05);
421
- /* Important: Set a minimum width for the chart box in the grid */
422
- min-width: 0;
 
423
  }}
424
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
425
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
@@ -427,14 +409,14 @@ margin=dict(t=50, b=10))
427
  /* Specific styling for highlighted text element */
428
  .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
429
  @media (max-width: 1050px) {{ /* Increased breakpoint to help prevent overlap */
430
- .grid {{
431
- grid-template-columns: 1fr; /* Stack charts vertically on smaller screens */
432
  }}
433
  }}
434
  </style></head><body>
435
  <div class="container">
436
  <h1>Entity and Topic Analysis Report</h1>
437
-
438
  <div class="metadata">
439
  <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
440
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
@@ -444,25 +426,26 @@ margin=dict(t=50, b=10))
444
  <div class="highlighted-text-container">
445
  {highlighted_text_html}
446
  </div>
447
-
448
  <h2>2. Full Extracted Entities Table</h2>
449
  {entity_table_html}
450
  <h2>3. Data Visualizations</h2>
451
-
452
  <h3>3.1 Entity Distribution Treemap</h3>
453
  <div class="chart-box">{treemap_html}</div>
454
- <h3>3.2 Comparative Charts (Pie, Category Count, Frequency)</h3>
455
- <div class="grid">
456
- <div class="chart-box">{pie_html}</div>
457
- <div class="chart-box">{bar_category_html}</div>
458
- <div class="chart-box">{bar_freq_html}</div>
459
- </div>
 
460
  <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
461
  <div class="chart-box">{network_html}</div>
462
-
463
  <h2>4. Topic Modeling (LDA on Entities)</h2>
464
  {topic_charts_html}
465
-
466
  </div></body></html>
467
  """
468
  return html_content
@@ -505,13 +488,17 @@ st.markdown(
505
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
506
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
507
  expander = st.expander("**Important notes**")
508
- expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.**Results:** Results are compiled into a single, comprehensive **HTML report** for easy download and sharing.**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 
 
509
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
510
  # --- Comet ML Setup (Placeholder/Conditional) ---
511
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
512
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
513
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
514
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 
515
  # --- Label Definitions and Category Mapping ---
516
  labels = list(entity_color_map.keys())
517
  category_mapping = {
@@ -521,8 +508,9 @@ category_mapping = {
521
  "Digital & Products": ["platform", "product", "media_type", "url"],
522
  }
523
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
524
  # --- Model Loading ---
525
- @st.cache_resource
526
  def load_ner_model():
527
  """Loads the GLiNER model and caches it."""
528
  try:
@@ -531,9 +519,9 @@ def load_ner_model():
531
  except Exception as e:
532
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
533
  st.stop()
534
-
535
  model = load_ner_model()
536
-
537
  # --- LONG DEFAULT TEXT (178 Words) ---
538
  DEFAULT_TEXT = (
539
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -551,7 +539,6 @@ DEFAULT_TEXT = (
551
  "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
552
  )
553
  # -----------------------------------
554
-
555
  # --- Session State Initialization (CRITICAL FIX) ---
556
  if 'show_results' not in st.session_state:
557
  st.session_state.show_results = False
@@ -566,7 +553,7 @@ if 'topic_results' not in st.session_state:
566
  # FIX: Initialize the text area key with default text before st.text_area is called
567
  if 'my_text_area' not in st.session_state:
568
  st.session_state.my_text_area = DEFAULT_TEXT
569
-
570
  # --- Clear Button Function (MODIFIED) ---
571
  def clear_text():
572
  """Clears the text area (sets it to an empty string) and hides results."""
@@ -577,21 +564,20 @@ def clear_text():
577
  st.session_state.results_df = pd.DataFrame()
578
  st.session_state.elapsed_time = 0.0
579
  st.session_state.topic_results = None
580
-
581
  # --- Text Input and Clear Button ---
582
  word_limit = 1000
583
  # The text area now safely uses the pre-initialized session state value
584
  text = st.text_area(
585
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
586
- height=250,
587
  key='my_text_area',
588
- value=st.session_state.my_text_area
589
- )
590
-
591
  word_count = len(text.split())
592
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
593
  st.button("Clear text", on_click=clear_text)
594
-
595
  # --- Results Trigger and Processing (Updated Logic) ---
596
  if st.button("Results"):
597
  if not text.strip():
@@ -605,27 +591,26 @@ if st.button("Results"):
605
  if text != st.session_state.last_text:
606
  st.session_state.last_text = text
607
  start_time = time.time()
608
-
609
  # --- Model Prediction & Dataframe Creation ---
610
  entities = model.predict_entities(text, labels)
611
  df = pd.DataFrame(entities)
612
-
613
  if not df.empty:
614
  df['text'] = df['text'].apply(remove_trailing_punctuation)
615
  df['category'] = df['label'].map(reverse_category_mapping)
616
  st.session_state.results_df = df
617
-
618
  unique_entity_count = len(df['text'].unique())
619
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
620
-
621
  st.session_state.topic_results = perform_topic_modeling(
622
- df,
623
- num_topics=2,
624
  num_top_words=N_TOP_WORDS_TO_USE
625
  )
626
-
627
  if comet_initialized:
628
- # FIX APPLIED HERE: Corrected indentation for the following lines
629
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
630
  experiment.log_parameter("input_text", text)
631
  experiment.log_table("predicted_entities", df)
@@ -633,10 +618,10 @@ if st.button("Results"):
633
  else:
634
  st.session_state.results_df = pd.DataFrame()
635
  st.session_state.topic_results = None
636
-
637
  end_time = time.time()
638
  st.session_state.elapsed_time = end_time - start_time
639
-
640
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
641
  st.session_state.show_results = True
642
 
@@ -644,144 +629,135 @@ if st.button("Results"):
644
  if st.session_state.show_results:
645
  df = st.session_state.results_df
646
  df_topic_data = st.session_state.topic_results
647
-
648
  if df.empty:
649
  st.warning("No entities were found in the provided text.")
650
  else:
651
  st.subheader("Analysis Results", divider="blue")
652
-
653
  # 1. Highlighted Text
654
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
655
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
656
-
657
  # 2. Entity Summary Table (Count by Label - kept outside tabs)
658
  st.markdown("### 2. Entity Summary Table (Count by Label)")
659
  grouped_entity_table = df['label'].value_counts().reset_index()
660
  grouped_entity_table.columns = ['Entity Label', 'Count']
661
  grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
662
  st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
 
663
 
664
- st.markdown("---")
665
  st.markdown("### 3. Detailed Entity Analysis")
666
-
667
  # 3. New Tabs: Tab 1: Category Details Table | Tab 2: Treemap
668
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
669
-
670
  # TAB 1: Detailed Entities Table Grouped by Category
671
  with tab_category_details:
672
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
673
-
674
  # Get the unique categories for creating inner tabs
675
  unique_categories = list(category_mapping.keys())
676
-
677
- # Create inner tabs dynamically based on the available categories
678
- tabs_category = st.tabs(unique_categories)
679
 
 
 
680
  # We iterate over the categories and tabs simultaneously
681
  for category, tab in zip(unique_categories, tabs_category):
682
  # Filter the main DataFrame for the current category
683
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
684
-
685
  with tab:
686
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
687
  if not df_category.empty:
688
  # Display the DataFrame for the current category
689
  st.dataframe(
690
- df_category,
691
- use_container_width=True,
692
  # Format the score for better readability
693
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
694
  )
695
  else:
696
  st.info(f"No entities of category **{category}** were found in the text.")
697
-
698
  # TAB 2: Treemap
699
  with tab_treemap_viz:
700
  st.markdown("#### Treemap: Entity Distribution")
701
  # Treemap
702
  # FIX 1 (Streamlit): Added a robust color sequence here too for consistency in the Streamlit plot
703
  fig_treemap = px.treemap(
704
- df,
705
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
706
  values='score',
707
- color='category',
708
  title="Entity Distribution by Category and Label",
709
  color_discrete_sequence=px.colors.qualitative.Dark24 # Applied fix here
710
  )
711
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
712
  st.plotly_chart(fig_treemap, use_container_width=True)
713
-
714
  # 4. Comparative Charts (Keep outside the new tabs, as in original code structure)
715
  st.markdown("---")
716
  st.markdown("### 4. Comparative Charts")
717
-
718
- # FIX 2 (Streamlit): The Streamlit plot columns (col1, col2, col3) naturally handle overlap,
719
- # so no change is needed here, the fix is only in the HTML report.
720
- col1, col2, col3 = st.columns(3)
721
-
722
- # Pie Chart
723
  grouped_counts = df['category'].value_counts().reset_index()
724
  grouped_counts.columns = ['Category', 'Count']
725
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',
726
- title='Distribution by Category',
727
- color_discrete_sequence=px.colors.sequential.RdBu)
728
  with col1:
 
 
729
  st.plotly_chart(fig_pie, use_container_width=True)
730
- # Category Count Bar Chart
731
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
732
- color='Category', title='Total Entities per Category',
733
- color_discrete_sequence=px.colors.qualitative.Pastel)
734
  with col2:
 
 
735
  st.plotly_chart(fig_bar_category, use_container_width=True)
736
- # Most Frequent Entities Bar Chart
737
- word_counts = df['text'].value_counts().reset_index()
738
- word_counts.columns = ['Entity', 'Count']
739
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
740
- fig_bar_freq = go.Figure().update_layout(title="No repeating entities for plot")
741
- if not repeating_entities.empty:
742
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
743
- color='Entity', title='Top 10 Most Frequent Entities',
744
- color_discrete_sequence=px.colors.sequential.Plasma)
745
  with col3:
746
- st.plotly_chart(fig_bar_freq, use_container_width=True)
747
-
748
- # 5. Network Graph
 
 
 
 
749
  st.markdown("---")
750
  st.markdown("### 5. Entity Co-occurrence Network")
 
 
751
  network_fig = generate_network_graph(df, st.session_state.last_text)
752
  st.plotly_chart(network_fig, use_container_width=True)
753
-
754
- # 6. Topic Modeling
755
  st.markdown("---")
756
- st.markdown("### 6. Topic Modeling (LDA on Entities)")
 
 
757
  if df_topic_data is not None and not df_topic_data.empty:
758
  bubble_figure = create_topic_word_bubbles(df_topic_data)
759
  if bubble_figure:
760
  st.plotly_chart(bubble_figure, use_container_width=True)
761
  else:
762
- st.error("Visualization for Topic Modeling failed.")
763
  else:
764
- st.info("Topic Modeling requires at least two unique entities and sufficient data to generate meaningful topics.")
765
 
766
- # Final Report Download
767
  st.markdown("---")
768
- st.markdown("### Download Full HTML Report 🚀")
769
-
770
- # Generate the full HTML content
771
- html_report = generate_html_report(
772
- df=df,
773
- text_input=st.session_state.last_text,
774
- elapsed_time=st.session_state.elapsed_time,
775
- df_topic_data=df_topic_data
776
- )
777
-
778
  st.download_button(
779
- label="Download Analysis Report (.html)",
780
  data=html_report,
781
- file_name="entity_analysis_report.html",
782
- mime="text/html"
 
783
  )
784
 
785
-
786
-
787
-
 
16
  # ------------------------------
17
  from gliner import GLiNER
18
  from streamlit_extras.stylable_container import stylable_container
19
+
20
  # Using a try/except for comet_ml import
21
  try:
22
  from comet_ml import Experiment
 
26
  def log_parameter(self, *args): pass
27
  def log_table(self, *args): pass
28
  def end(self): pass
29
+
30
  # --- Model Home Directory (Fix for deployment environments) ---
31
  # Set HF_HOME environment variable to a writable path
32
  os.environ['HF_HOME'] = '/tmp'
33
+
34
  # --- Color Map for Highlighting and Network Graph Nodes ---
35
  entity_color_map = {
36
  "person": "#10b981",
 
49
  "url": "#60a5fa",
50
  "nationality_religion": "#fb7185"
51
  }
52
+
53
  # --- Utility Functions ---
54
  def extract_label(node_name):
55
  """Extracts the label from a node string like 'Text (Label)'."""
56
  match = re.search(r'\(([^)]+)\)$', node_name)
57
  return match.group(1) if match else "Unknown"
58
+
59
  def remove_trailing_punctuation(text_string):
60
  """Removes trailing punctuation from a string."""
61
  return text_string.rstrip(string.punctuation)
62
+
63
  def highlight_entities(text, df_entities):
64
  """Generates HTML to display text with entities highlighted and colored."""
65
  if df_entities.empty:
66
  return text
67
+
68
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
69
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
70
  highlighted_text = text
71
+
72
  for entity in entities:
73
  start = entity['start']
74
  end = entity['end']
 
78
 
79
  # Create a span with background color and tooltip
80
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
 
81
  # Replace the original text segment with the highlighted HTML
82
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
83
+
84
  # Use a div to mimic the Streamlit input box style for the report
85
  return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
86
 
87
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Run LDA topic modeling over the unique extracted entity strings.

    Parameters
    ----------
    df_entities : pandas.DataFrame
        Entity table; only the 'text' column is consulted.
    num_topics : int
        Number of LDA components to fit.
    num_top_words : int
        Upper bound on the number of words reported per topic.

    Returns
    -------
    pandas.DataFrame or None
        One row per (topic, word) pair with columns 'Topic_ID', 'Word'
        and 'Weight'; None when fewer than two unique entities exist or
        the vectorizer/LDA step raises.
    """
    # Each unique entity string is treated as its own "document".
    docs = df_entities['text'].unique().tolist()
    if len(docs) < 2:
        return None

    # Never ask for more top words than there are documents.
    top_n = min(num_top_words, len(docs))
    try:
        # stop_words='english' filters common tokens produced by
        # multi-word entities (e.g. "The" from "The White House");
        # min_df=1 keeps every remaining unique entity token.
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words='english')
        doc_term_matrix = vectorizer.fit_transform(docs)
        vocab = vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics,
            max_iter=5,
            learning_method='online',
            random_state=42,
            n_jobs=-1,
        )
        lda.fit(doc_term_matrix)

        rows = []
        for topic_idx, weights in enumerate(lda.components_):
            # Indices of the top_n highest-weighted vocabulary terms.
            best_indices = weights.argsort()[:-top_n - 1:-1]
            rows.extend(
                {
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': vocab[i],
                    'Weight': weights[i],
                }
                for i in best_indices
            )
        return pd.DataFrame(rows)

    except Exception as e:
        st.error(f"Topic modeling failed: {e}")
        return None
130
+
131
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of per-topic word weights; None if no data."""
    if df_topic_data.empty:
        return None

    # Hover shows the word and a 3-decimal weight; the topic id is hidden
    # because it is already encoded on the y axis and in the color.
    hover_spec = {'Word': True, 'Weight': ':.3f', 'Topic_ID': False}
    fig = px.scatter(
        df_topic_data,
        x='Word',
        y='Topic_ID',
        size='Weight',
        color='Topic_ID',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        hover_data=hover_spec,
    )

    # Pink-tinted backgrounds; y axis reversed so Topic #1 renders on top.
    layout_opts = dict(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Topic ID",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True, 'autorange': 'reversed'},
        showlegend=True,
        plot_bgcolor='#FFF0F5',
        paper_bgcolor='#FFF0F5',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )
    fig.update_layout(**layout_opts)

    # Outline each bubble for readability against the pale background.
    fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
    return fig
162
+
163
  def generate_network_graph(df, raw_text):
164
  """
165
+ Generates a network graph visualization (Node Plot) with edges
166
  based on entity co-occurrence in sentences.
167
  """
168
  entity_counts = df['text'].value_counts().reset_index()
169
  entity_counts.columns = ['text', 'frequency']
170
+
171
  # Merge counts with unique entities (text + label)
172
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
 
173
  if unique_entities.shape[0] < 2:
174
  # Return a simple figure with a message if not enough data
175
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
176
+
177
  num_nodes = len(unique_entities)
178
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
179
+
180
+ radius = 10
 
181
  # Assign circular positions + a little randomness
182
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
183
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
184
+
185
  # Map entity text to its coordinates for easy lookup
186
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
187
  # ----------------------------------------------------------------------
188
  # 1. Identify Edges (Co-occurrence in sentences)
189
  # ----------------------------------------------------------------------
190
  edges = set()
191
+
192
  # Simple sentence segmentation (handles standard punctuation followed by space)
193
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
 
194
  for sentence in sentences:
195
  # Find unique entities that are substrings of this sentence
196
  entities_in_sentence = []
197
  for entity_text in unique_entities['text'].unique():
198
  if entity_text.lower() in sentence.lower():
199
  entities_in_sentence.append(entity_text)
 
200
  # Create edges (pairs) based on co-occurrence
201
  unique_entities_in_sentence = list(set(entities_in_sentence))
202
+
203
  # Create all unique pairs (edges)
204
  for i in range(len(unique_entities_in_sentence)):
205
  for j in range(i + 1, len(unique_entities_in_sentence)):
206
  node1 = unique_entities_in_sentence[i]
207
  node2 = unique_entities_in_sentence[j]
208
+
209
  # Ensure consistent order for the set to avoid duplicates like (A, B) and (B, A)
210
  edge_tuple = tuple(sorted((node1, node2)))
211
  edges.add(edge_tuple)
 
212
  # ----------------------------------------------------------------------
213
  # 2. Create Plotly Trace Data for Edges
214
  # ----------------------------------------------------------------------
215
  edge_x = []
216
  edge_y = []
217
+
218
  for edge in edges:
219
  n1, n2 = edge
220
  if n1 in pos_map and n2 in pos_map:
221
  # Append coordinates for line segment: [x1, x2, None] for separation
222
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
223
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
224
+
225
  fig = go.Figure()
226
+
227
  # Add Edge Trace (Lines)
228
  edge_trace = go.Scatter(
229
  x=edge_x, y=edge_y,
 
234
  showlegend=False # Edges don't need a legend entry
235
  )
236
  fig.add_trace(edge_trace)
 
237
  # ----------------------------------------------------------------------
238
  # 3. Add Node Trace (Markers)
239
  # ----------------------------------------------------------------------
 
244
  name='Entities',
245
  text=unique_entities['text'],
246
  textposition="top center",
247
+ # FIX: Explicitly set showlegend=False for the main node trace
248
  # as we are creating separate traces for the legend colors below.
249
+ showlegend=False,
250
  marker=dict(
251
  size=unique_entities['frequency'] * 5 + 10,
252
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
 
263
  "Frequency: %{customdata[2]}<extra></extra>"
264
  )
265
  ))
266
+
267
  # Adding discrete traces for the legend based on unique labels
268
  legend_traces = []
269
  seen_labels = set()
 
277
  y=[None],
278
  mode='markers',
279
  marker=dict(size=10, color=color),
280
+ name=f"{label.capitalize()}",
281
  showlegend=True # Ensure legend traces are explicitly visible
282
  ))
283
  for trace in legend_traces:
 
295
  margin=dict(t=50, b=10, l=10, r=10),
296
  height=600
297
  )
298
+
299
  return fig
300
 
301
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
302
  """
303
  Generates a full HTML report containing all analysis results and visualizations.
304
+
305
+ FIX APPLIED: Removed the CSS Grid layout for the three comparative charts
306
+ (Pie, Category Count, Frequency) and stacked them vertically to prevent
307
+ overlapping and ensure reliable rendering across devices.
308
  """
309
+
310
  # 1. Generate Visualizations (Plotly HTML)
311
+
312
  # 1a. Treemap
313
  # FIX 1: Explicitly set a color_discrete_sequence to prevent the Treemap from being black
314
  fig_treemap = px.treemap(
315
+ df,
316
+ path=[px.Constant("All Entities"), 'category', 'label', 'text'],
317
  values='score',
318
+ color='category',
319
  title="Entity Distribution by Category and Label",
320
  color_discrete_sequence=px.colors.qualitative.Dark24 # Use a robust color sequence
321
  )
322
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
323
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
324
+
325
  # 1b. Pie Chart
326
  grouped_counts = df['category'].value_counts().reset_index()
327
  grouped_counts.columns = ['Category', 'Count']
328
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
 
 
329
  fig_pie.update_layout(margin=dict(t=50, b=10))
330
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
331
+
332
  # 1c. Bar Chart (Category Count)
333
+ fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
334
+ fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=10))
335
+ bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
336
+
 
 
 
 
337
  # 1d. Bar Chart (Most Frequent Entities)
338
+ word_counts = df['text'].value_counts().reset_index()
339
+ word_counts.columns = ['Entity', 'Count']
 
340
  # Top 10 repeating entities
341
+ repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
342
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
343
+
344
  if not repeating_entities.empty:
345
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
346
+ fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=10))
 
 
 
347
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
348
+
349
  # 1e. Network Graph HTML - UPDATED to pass text_input
350
  network_fig = generate_network_graph(df, text_input)
351
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
352
+
353
  # 1f. Topic Charts HTML (Now a single Bubble Chart with Placeholder logic)
354
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
355
  if df_topic_data is not None and not df_topic_data.empty:
 
364
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
365
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
366
  topic_charts_html += '</div>'
367
+
368
  # 2. Get Highlighted Text
369
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
370
+
371
  # 3. Entity Tables (Pandas to HTML)
372
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
373
+ classes='table table-striped',
374
  index=False
375
  )
376
+
377
  # 4. Construct the Final HTML
378
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
379
  <meta charset="UTF-8">
 
387
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
388
  h3 {{ color: #555; margin-top: 20px; }}
389
  .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
390
+ /* The 'grid' class is kept for potential future use or the network graph, but not used for 3.2 */
391
+ .grid {{
392
+ display: grid;
393
+ grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
394
+ gap: 20px;
395
+ margin-top: 20px;
396
  }}
397
+ .chart-box {{
398
+ background-color: #f9f9f9;
399
+ padding: 15px;
400
+ border-radius: 8px;
401
  box-shadow: 0 2px 4px rgba(0,0,0,0.05);
402
+ /* Important: Set a minimum width for the chart box, and margin for stacking */
403
+ min-width: 0;
404
+ margin-bottom: 20px; /* NEW: Added margin for separation when stacked */
405
  }}
406
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
407
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
 
409
  /* Specific styling for highlighted text element */
410
  .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
411
  @media (max-width: 1050px) {{ /* Increased breakpoint to help prevent overlap */
412
+ .grid {{
413
+ grid-template-columns: 1fr; /* Stack charts vertically on smaller screens */
414
  }}
415
  }}
416
  </style></head><body>
417
  <div class="container">
418
  <h1>Entity and Topic Analysis Report</h1>
419
+
420
  <div class="metadata">
421
  <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
422
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
 
426
  <div class="highlighted-text-container">
427
  {highlighted_text_html}
428
  </div>
429
+
430
  <h2>2. Full Extracted Entities Table</h2>
431
  {entity_table_html}
432
  <h2>3. Data Visualizations</h2>
433
+
434
  <h3>3.1 Entity Distribution Treemap</h3>
435
  <div class="chart-box">{treemap_html}</div>
436
+ <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
437
+
438
+ <!-- FIX: Charts are now in separate chart-box divs (not a 'grid') for guaranteed vertical stacking -->
439
+ <div class="chart-box">{pie_html}</div>
440
+ <div class="chart-box">{bar_category_html}</div>
441
+ <div class="chart-box">{bar_freq_html}</div>
442
+
443
  <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
444
  <div class="chart-box">{network_html}</div>
445
+
446
  <h2>4. Topic Modeling (LDA on Entities)</h2>
447
  {topic_charts_html}
448
+
449
  </div></body></html>
450
  """
451
  return html_content
 
488
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
489
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
490
  expander = st.expander("**Important notes**")
491
+ expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
492
+ **Results:** Results are compiled into a single, comprehensive **HTML report** for easy download and sharing.
493
+ **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
494
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
495
+
496
  # --- Comet ML Setup (Placeholder/Conditional) ---
497
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
498
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
499
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
500
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
501
+
502
  # --- Label Definitions and Category Mapping ---
503
  labels = list(entity_color_map.keys())
504
  category_mapping = {
 
508
  "Digital & Products": ["platform", "product", "media_type", "url"],
509
  }
510
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
511
+
512
  # --- Model Loading ---
513
+ @st.cache_resourced
514
  def load_ner_model():
515
  """Loads the GLiNER model and caches it."""
516
  try:
 
519
  except Exception as e:
520
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
521
  st.stop()
522
+
523
  model = load_ner_model()
524
+
525
  # --- LONG DEFAULT TEXT (178 Words) ---
526
  DEFAULT_TEXT = (
527
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
 
539
  "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
540
  )
541
  # -----------------------------------
 
542
  # --- Session State Initialization (CRITICAL FIX) ---
543
  if 'show_results' not in st.session_state:
544
  st.session_state.show_results = False
 
553
  # FIX: Initialize the text area key with default text before st.text_area is called
554
  if 'my_text_area' not in st.session_state:
555
  st.session_state.my_text_area = DEFAULT_TEXT
556
+
557
  # --- Clear Button Function (MODIFIED) ---
558
  def clear_text():
559
  """Clears the text area (sets it to an empty string) and hides results."""
 
564
  st.session_state.results_df = pd.DataFrame()
565
  st.session_state.elapsed_time = 0.0
566
  st.session_state.topic_results = None
567
+
568
# --- Text Input and Clear Button ---
word_limit = 1000
# The widget's state is pre-seeded via st.session_state['my_text_area']
# (initialized above with DEFAULT_TEXT). A keyed widget must NOT also be
# given value=: Streamlit warns that the widget "was created with a default
# value but also had its value set via the Session State API" and the
# default conflicts with the session value, so the session state alone is
# the single source of truth here.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area')

word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
st.button("Clear text", on_click=clear_text)
580
+
581
  # --- Results Trigger and Processing (Updated Logic) ---
582
  if st.button("Results"):
583
  if not text.strip():
 
591
  if text != st.session_state.last_text:
592
  st.session_state.last_text = text
593
  start_time = time.time()
594
+
595
  # --- Model Prediction & Dataframe Creation ---
596
  entities = model.predict_entities(text, labels)
597
  df = pd.DataFrame(entities)
598
+
599
  if not df.empty:
600
  df['text'] = df['text'].apply(remove_trailing_punctuation)
601
  df['category'] = df['label'].map(reverse_category_mapping)
602
  st.session_state.results_df = df
603
+
604
  unique_entity_count = len(df['text'].unique())
605
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
606
+
607
  st.session_state.topic_results = perform_topic_modeling(
608
+ df,
609
+ num_topics=2,
610
  num_top_words=N_TOP_WORDS_TO_USE
611
  )
612
+
613
  if comet_initialized:
 
614
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
615
  experiment.log_parameter("input_text", text)
616
  experiment.log_table("predicted_entities", df)
 
618
  else:
619
  st.session_state.results_df = pd.DataFrame()
620
  st.session_state.topic_results = None
621
+
622
  end_time = time.time()
623
  st.session_state.elapsed_time = end_time - start_time
624
+
625
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
626
  st.session_state.show_results = True
627
 
 
629
  if st.session_state.show_results:
630
  df = st.session_state.results_df
631
  df_topic_data = st.session_state.topic_results
632
+
633
  if df.empty:
634
  st.warning("No entities were found in the provided text.")
635
  else:
636
  st.subheader("Analysis Results", divider="blue")
637
+
638
  # 1. Highlighted Text
639
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
640
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
641
+
642
  # 2. Entity Summary Table (Count by Label - kept outside tabs)
643
  st.markdown("### 2. Entity Summary Table (Count by Label)")
644
  grouped_entity_table = df['label'].value_counts().reset_index()
645
  grouped_entity_table.columns = ['Entity Label', 'Count']
646
  grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
647
  st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
648
+ st.markdown("---")
649
 
 
650
  st.markdown("### 3. Detailed Entity Analysis")
 
651
  # 3. New Tabs: Tab 1: Category Details Table | Tab 2: Treemap
652
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
653
+
654
  # TAB 1: Detailed Entities Table Grouped by Category
655
  with tab_category_details:
656
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
 
657
  # Get the unique categories for creating inner tabs
658
  unique_categories = list(category_mapping.keys())
 
 
 
659
 
660
+ # Create inner tabs dynamically based on the available categories
661
+ tabs_category = st.tabs(unique_categories)
662
  # We iterate over the categories and tabs simultaneously
663
  for category, tab in zip(unique_categories, tabs_category):
664
  # Filter the main DataFrame for the current category
665
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
666
+
667
  with tab:
668
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
669
  if not df_category.empty:
670
  # Display the DataFrame for the current category
671
  st.dataframe(
672
+ df_category,
673
+ use_container_width=True,
674
  # Format the score for better readability
675
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
676
  )
677
  else:
678
  st.info(f"No entities of category **{category}** were found in the text.")
 
679
  # TAB 2: Treemap
680
  with tab_treemap_viz:
681
  st.markdown("#### Treemap: Entity Distribution")
682
  # Treemap
683
  # FIX 1 (Streamlit): Added a robust color sequence here too for consistency in the Streamlit plot
684
  fig_treemap = px.treemap(
685
+ df,
686
+ path=[px.Constant("All Entities"), 'category', 'label', 'text'],
687
  values='score',
688
+ color='category',
689
  title="Entity Distribution by Category and Label",
690
  color_discrete_sequence=px.colors.qualitative.Dark24 # Applied fix here
691
  )
692
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
693
  st.plotly_chart(fig_treemap, use_container_width=True)
694
+
695
  # 4. Comparative Charts (Keep outside the new tabs, as in original code structure)
696
  st.markdown("---")
697
  st.markdown("### 4. Comparative Charts")
698
+
699
+ # FIX: The three comparative charts are generated here and will be stacked vertically
700
+ # in the HTML report output.
701
+ col1, col2, col3 = st.columns(3) # Use Streamlit columns for the *Streamlit* preview
702
+
 
703
  grouped_counts = df['category'].value_counts().reset_index()
704
  grouped_counts.columns = ['Category', 'Count']
705
+
706
+ # Pie Chart
 
707
  with col1:
708
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
709
+ fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
710
  st.plotly_chart(fig_pie, use_container_width=True)
711
+
712
+ # Bar Chart (Category Count)
 
 
713
  with col2:
714
+ fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
715
+ fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
716
  st.plotly_chart(fig_bar_category, use_container_width=True)
717
+
718
+ # Bar Chart (Most Frequent Entities)
719
+ word_counts = df['text'].value_counts().reset_index()
720
+ word_counts.columns = ['Entity', 'Count']
721
+ repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
722
+
 
 
 
723
  with col3:
724
+ if not repeating_entities.empty:
725
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
726
+ fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
727
+ st.plotly_chart(fig_bar_freq, use_container_width=True)
728
+ else:
729
+ st.info("No entities repeat for frequency chart.")
730
+
731
  st.markdown("---")
732
  st.markdown("### 5. Entity Co-occurrence Network")
733
+
734
+ # 5. Network Graph
735
  network_fig = generate_network_graph(df, st.session_state.last_text)
736
  st.plotly_chart(network_fig, use_container_width=True)
737
+
 
738
  st.markdown("---")
739
+ st.markdown("### 6. Topic Modeling Analysis")
740
+
741
+ # 6. Topic Modeling Bubble Chart
742
  if df_topic_data is not None and not df_topic_data.empty:
743
  bubble_figure = create_topic_word_bubbles(df_topic_data)
744
  if bubble_figure:
745
  st.plotly_chart(bubble_figure, use_container_width=True)
746
  else:
747
+ st.error("Error generating Topic Word Bubble Chart.")
748
  else:
749
+ st.info("Topic modeling requires more unique input (at least two unique entities).")
750
 
751
+ # --- Report Download ---
752
  st.markdown("---")
753
+ st.markdown("### Download Full HTML Report")
754
+
755
+ html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
 
 
 
 
 
 
 
756
  st.download_button(
757
+ label="Download HTML Report",
758
  data=html_report,
759
+ file_name="ner_topic_report.html",
760
+ mime="text/html",
761
+ type="primary"
762
  )
763