AIEcosystem committed on
Commit
9881c2e
·
verified ·
1 Parent(s): b2f8b8b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +197 -177
src/streamlit_app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import time
3
  import streamlit as st
4
  import streamlit.components.v1 as components
@@ -10,23 +11,28 @@ import numpy as np
10
  import re
11
  import string
12
  import json
13
-
14
- # --- PPTX Imports (Kept for completeness) ---
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
  import plotly.io as pio # Required for image export
20
- # -------------------------------------------
21
-
22
  # --- Stable Scikit-learn LDA Imports ---
23
  from sklearn.feature_extraction.text import TfidfVectorizer
24
  from sklearn.decomposition import LatentDirichletAllocation
25
- # ---------------------------------------
26
-
27
  from gliner import GLiNER
28
  from streamlit_extras.stylable_container import stylable_container
29
 
 
 
 
 
 
 
 
 
30
  # Using a try/except for comet_ml import
31
  try:
32
  from comet_ml import Experiment
@@ -36,11 +42,9 @@ except ImportError:
36
  def log_parameter(self, *args): pass
37
  def log_table(self, *args): pass
38
  def end(self): pass
39
-
40
  # --- Model Home Directory (Fix for deployment environments) ---
41
  # Set HF_HOME environment variable to a writable path
42
  os.environ['HF_HOME'] = '/tmp'
43
-
44
  # --- Color Map for Highlighting and Network Graph Nodes ---
45
  entity_color_map = {
46
  "person": "#10b981",
@@ -52,28 +56,23 @@ entity_color_map = {
52
  "cardinal": "#06b6d4",
53
  "money": "#f43f5e",
54
  "position": "#a855f7",
55
- }
56
-
57
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
58
  labels = list(entity_color_map.keys())
59
  category_mapping = {
60
  "People": ["person", "organization", "position"],
61
  "Locations": ["country", "city"],
62
  "Time": ["date", "time"],
63
- "Numbers": ["money", "cardinal"]
64
- }
65
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
66
-
67
  # --- Utility Functions for Analysis and Plotly ---
68
  def extract_label(node_name):
69
  """Extracts the label from a node string like 'Text (Label)'."""
70
  match = re.search(r'\(([^)]+)\)$', node_name)
71
  return match.group(1) if match else "Unknown"
72
-
73
  def remove_trailing_punctuation(text_string):
74
  """Removes trailing punctuation from a string."""
75
  return text_string.rstrip(string.punctuation)
76
-
77
  def highlight_entities(text, df_entities):
78
  """Generates HTML to display text with entities highlighted and colored."""
79
  if df_entities.empty:
@@ -94,31 +93,33 @@ def highlight_entities(text, df_entities):
94
  # Use a div to mimic the Streamlit input box style for the report
95
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
96
 
 
97
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
98
  """
99
  Performs basic Topic Modeling using LDA on the extracted entities,
100
- allowing for n-grams (up to 3 words) to capture multi-word entities.
101
  """
102
- # 1. Prepare Documents: Use unique entities
103
  documents = df_entities['text'].unique().tolist()
104
-
105
  if len(documents) < 2:
106
  return None
107
-
108
  N = min(num_top_words, len(documents))
109
 
110
  try:
111
- # 2. Vectorizer: Use TfidfVectorizer with ngram_range to capture multi-word entities.
 
112
  tfidf_vectorizer = TfidfVectorizer(
113
  max_df=0.95,
114
  min_df=2, # Only consider words/phrases that appear at least twice to find topics
115
  stop_words='english',
116
- ngram_range=(1, 3)
117
  )
118
 
119
  tfidf = tfidf_vectorizer.fit_transform(documents)
120
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
121
-
122
  # Check if the vocabulary is too small after tokenization/ngram generation
123
  if len(tfidf_feature_names) < num_topics:
124
  # Re-run with min_df=1 if vocab is too small
@@ -136,35 +137,43 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
136
  random_state=42, n_jobs=-1
137
  )
138
  lda.fit(tfidf)
139
-
140
  # 4. Extract Topic Data
141
  topic_data_list = []
142
  for topic_idx, topic in enumerate(lda.components_):
143
  top_words_indices = topic.argsort()[:-N - 1:-1]
 
144
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
145
  word_weights = [topic[i] for i in top_words_indices]
146
-
147
  for word, weight in zip(top_words, word_weights):
148
  topic_data_list.append({
149
  'Topic_ID': f'Topic #{topic_idx + 1}',
150
  'Word': word,
151
  'Weight': weight,
152
  })
153
-
154
  return pd.DataFrame(topic_data_list)
155
-
156
  except Exception as e:
 
 
157
  return None
 
 
 
 
158
 
159
  def create_topic_word_bubbles(df_topic_data):
160
  """Generates a Plotly Bubble Chart for top words across
161
  all topics, displaying the word directly on the bubble."""
162
  # Renaming columns to match the output of perform_topic_modeling
163
- df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
 
164
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position
165
  if df_topic_data.empty:
166
  return None
167
-
168
  fig = px.scatter(
169
  df_topic_data,
170
  x='x_pos',
@@ -174,7 +183,7 @@ def create_topic_word_bubbles(df_topic_data):
174
  # Set text to the word
175
  text='word',
176
  hover_name='word',
177
- size_max=40, # Reduced size_max for smaller bubbles
178
  title='Topic Word Weights (Bubble Chart)',
179
  color_discrete_sequence=px.colors.qualitative.Bold,
180
  labels={
@@ -188,7 +197,7 @@ def create_topic_word_bubbles(df_topic_data):
188
  fig.update_layout(
189
  xaxis_title="Entity/Word",
190
  yaxis_title="Word Weight",
191
- # Hides the vertical X-axis line, tick labels, and grid
192
  xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
193
  yaxis={'showgrid': True},
194
  showlegend=True,
@@ -197,56 +206,54 @@ def create_topic_word_bubbles(df_topic_data):
197
  height=600,
198
  margin=dict(t=50, b=100, l=50, r=10),
199
  )
200
-
201
- # Update traces to set text color to white
202
  fig.update_traces(
 
203
  textposition='middle center',
204
- textfont=dict(color='white', size=10), # Fix for text visibility
 
 
 
205
  hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
206
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
207
  )
208
-
209
  return fig
210
 
 
 
211
  def generate_network_graph(df, raw_text):
212
  """
213
  Generates a network graph visualization (Node Plot) with edges
214
- based on entity co-occurrence in sentences.
215
  """
 
216
  entity_counts = df['text'].value_counts().reset_index()
217
  entity_counts.columns = ['text', 'frequency']
218
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
219
-
220
  if unique_entities.shape[0] < 2:
221
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
222
-
223
- # Positioning logic (simplified circular layout with slight jitter)
224
  num_nodes = len(unique_entities)
225
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
226
  radius = 10
227
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
228
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
229
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
230
-
231
- # Co-occurrence Edges based on sentences
232
  edges = set()
233
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
234
-
235
  for sentence in sentences:
236
  entities_in_sentence = []
237
  for entity_text in unique_entities['text'].unique():
238
  if entity_text.lower() in sentence.lower():
239
  entities_in_sentence.append(entity_text)
240
  unique_entities_in_sentence = list(set(entities_in_sentence))
241
-
242
- # Create edges for all pairs in the sentence
243
  for i in range(len(unique_entities_in_sentence)):
244
  for j in range(i + 1, len(unique_entities_in_sentence)):
245
  node1 = unique_entities_in_sentence[i]
246
  node2 = unique_entities_in_sentence[j]
247
  edge_tuple = tuple(sorted((node1, node2)))
248
  edges.add(edge_tuple)
249
-
250
  edge_x = []
251
  edge_y = []
252
  for edge in edges:
@@ -254,10 +261,7 @@ def generate_network_graph(df, raw_text):
254
  if n1 in pos_map and n2 in pos_map:
255
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
256
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
257
-
258
  fig = go.Figure()
259
-
260
- # Edge Trace
261
  edge_trace = go.Scatter(
262
  x=edge_x, y=edge_y,
263
  line=dict(width=0.5, color='#888'),
@@ -267,8 +271,6 @@ def generate_network_graph(df, raw_text):
267
  showlegend=False
268
  )
269
  fig.add_trace(edge_trace)
270
-
271
- # Node Trace
272
  fig.add_trace(go.Scatter(
273
  x=unique_entities['x'],
274
  y=unique_entities['y'],
@@ -278,7 +280,6 @@ def generate_network_graph(df, raw_text):
278
  textposition="top center",
279
  showlegend=False,
280
  marker=dict(
281
- # Size nodes based on frequency
282
  size=unique_entities['frequency'] * 5 + 10,
283
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
284
  line_width=1,
@@ -294,8 +295,6 @@ def generate_network_graph(df, raw_text):
294
  "Frequency: %{customdata[2]}<extra></extra>"
295
  )
296
  ))
297
-
298
- # Custom Legend for Node Colors
299
  legend_traces = []
300
  seen_labels = set()
301
  for index, row in unique_entities.iterrows():
@@ -308,7 +307,6 @@ def generate_network_graph(df, raw_text):
308
  ))
309
  for trace in legend_traces:
310
  fig.add_trace(trace)
311
-
312
  fig.update_layout(
313
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
314
  showlegend=True,
@@ -321,8 +319,7 @@ def generate_network_graph(df, raw_text):
321
  height=600
322
  )
323
  return fig
324
-
325
- # --- CSV GENERATION FUNCTION ---
326
  def generate_entity_csv(df):
327
  """
328
  Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -334,16 +331,14 @@ def generate_entity_csv(df):
334
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
335
  csv_buffer.seek(0)
336
  return csv_buffer
337
- # -----------------------------
338
-
339
- # --- HTML REPORT GENERATION FUNCTION ---
340
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
341
  """
342
  Generates a full HTML report containing all analysis results and visualizations.
343
- Includes mobile-specific CSS fixes.
344
  """
345
  # 1. Generate Visualizations (Plotly HTML)
346
-
347
  # 1a. Treemap
348
  fig_treemap = px.treemap(
349
  df,
@@ -355,101 +350,69 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
355
  )
356
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
357
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
358
-
359
  # 1b. Pie Chart
360
  grouped_counts = df['category'].value_counts().reset_index()
361
  grouped_counts.columns = ['Category', 'Count']
 
362
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
363
  fig_pie.update_layout(margin=dict(t=50, b=10))
364
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
365
-
366
  # 1c. Bar Chart (Category Count)
367
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
368
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
369
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
370
-
371
  # 1d. Bar Chart (Most Frequent Entities)
372
  word_counts = df['text'].value_counts().reset_index()
373
  word_counts.columns = ['Entity', 'Count']
374
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
375
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
376
  if not repeating_entities.empty:
 
377
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
378
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
379
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
380
-
381
  # 1e. Network Graph HTML
382
  network_fig = generate_network_graph(df, text_input)
383
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
384
-
385
  # 1f. Topic Charts HTML
386
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
387
  if df_topic_data is not None and not df_topic_data.empty:
388
  bubble_figure = create_topic_word_bubbles(df_topic_data)
389
  if bubble_figure:
390
- # Added config={'responsive': True} for HTML report resizing
391
  topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
392
  else:
393
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
394
  else:
395
- topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
396
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
397
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
398
  topic_charts_html += '</div>'
399
-
400
  # 2. Get Highlighted Text
401
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
402
-
403
  # 3. Entity Tables (Pandas to HTML)
404
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
405
  classes='table table-striped',
406
  index=False
407
  )
408
-
409
- # 4. Construct the Final HTML with Corrected Mobile CSS
410
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
411
  <meta charset="UTF-8">
412
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
413
  <title>Entity and Topic Analysis Report</title>
414
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
415
  <style>
416
- body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px;background-color: #f4f4f9; color: #333; }}
417
- .container {{ max-width: 1200px; margin: 0 auto; background-color:#ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12pxrgba(0,0,0,0.1); }}
418
- h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom:10px; margin-top: 0; }}
419
- h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd;padding-bottom: 5px; }}
420
  h3 {{ color: #555; margin-top: 20px; }}
421
- .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius:8px; margin-bottom: 20px; font-size: 0.9em; }}
422
- .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius:8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px;}}
423
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
424
- table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align:left; }}
425
  table th {{ background-color: #f0f0f0; }}
426
- .highlighted-text {{ border: 1px solid #888888; padding: 15px;border-radius: 5px; background-color: #ffffff; font-family: monospace;white-space: pre-wrap; margin-bottom: 20px; }}
427
-
428
- /* === MOBILE-SPECIFIC FIXES FOR REPORT OVERLAP === */
429
- @media (max-width: 600px) {
430
- body {
431
- padding: 10px;
432
- }
433
- .container {
434
- padding: 10px;
435
- border-radius: 0;
436
- }
437
- .chart-box {
438
- padding: 5px;
439
- overflow-x: auto; /* Allow horizontal scrolling for wide charts */
440
- }
441
- /* Ensures the Plotly chart inside has a minimum width */
442
- .chart-box > div {
443
- min-width: 400px;
444
- }
445
- /* Force tables to be scrollable */
446
- table {
447
- display: block;
448
- overflow-x: auto;
449
- white-space: nowrap;
450
- }
451
- }
452
- /* ============================================== */
453
  </style></head><body>
454
  <div class="container">
455
  <h1>Entity and Topic Analysis Report</h1>
@@ -478,10 +441,10 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
478
  </div></body></html>
479
  """
480
  return html_content
481
-
482
  # --- Page Configuration and Styling (No Sidebar) ---
483
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
484
 
 
485
  # --- Conditional Mobile Warning ---
486
  st.markdown(
487
  """
@@ -517,35 +480,50 @@ st.markdown(
517
  )
518
  # ----------------------------------
519
 
520
- # --- General Streamlit Style Fixes ---
 
 
 
 
 
 
 
521
  st.markdown(
522
  """
523
  <style>
 
524
  /* --- FIX: Tab Label Colors for Visibility --- */
 
525
  [data-testid="stConfigurableTabs"] button {
526
- color: #333333 !important;
527
- background-color: #f0f0f0;
528
  border: 1px solid #cccccc;
529
  }
530
  /* Target the ACTIVE tab label */
531
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
532
- color: #FFFFFF !important;
533
- background-color: #007bff;
534
- border-bottom: 2px solid #007bff;
535
  }
536
- /* Expander header color fix */
 
537
  .streamlit-expanderHeader {
538
- color: #007bff;
539
  }
540
  </style>
541
  """,
542
  unsafe_allow_html=True
543
  )
544
 
545
- st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
 
546
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
547
 
548
- tab1, tab2 = st.tabs(["Embed", "Important Notes"])
 
 
 
 
549
 
550
  with tab1:
551
  with st.expander("Embed"):
@@ -558,25 +536,32 @@ with tab1:
558
  height="450"
559
  ></iframe>
560
  '''
561
- st.code(code, language="html")
 
 
562
 
563
  with tab2:
564
  expander = st.expander("**Important Notes**")
 
 
565
  expander.markdown("""
566
  **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
 
567
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
 
568
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
 
569
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
570
  """)
571
 
572
- st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
573
 
574
  # --- Comet ML Setup (Placeholder/Conditional) ---
575
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
576
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
577
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
578
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
579
-
580
  # --- Model Loading ---
581
  @st.cache_resource
582
  def load_ner_model():
@@ -586,10 +571,9 @@ def load_ner_model():
586
  except Exception as e:
587
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
588
  st.stop()
589
-
590
  model = load_ner_model()
591
-
592
  # --- LONG DEFAULT TEXT (178 Words) ---
 
593
  DEFAULT_TEXT = (
594
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
595
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
@@ -606,9 +590,16 @@ DEFAULT_TEXT = (
606
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
607
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
608
  )
609
- # -----------------------------------
610
 
611
- # --- Session State Initialization (CRITICAL) ---
 
 
 
 
 
 
 
 
612
  if 'show_results' not in st.session_state:
613
  st.session_state.show_results = False
614
  if 'last_text' not in st.session_state:
@@ -619,11 +610,9 @@ if 'elapsed_time' not in st.session_state:
619
  st.session_state.elapsed_time = 0.0
620
  if 'topic_results' not in st.session_state:
621
  st.session_state.topic_results = None
622
- # --- FIX: Only set default text in session state, not in st.text_area value ---
623
  if 'my_text_area' not in st.session_state:
624
  st.session_state.my_text_area = DEFAULT_TEXT
625
-
626
- # --- Clear Button Function ---
627
  def clear_text():
628
  """Clears the text area (sets it to an empty string) and hides results."""
629
  st.session_state['my_text_area'] = ""
@@ -632,19 +621,16 @@ def clear_text():
632
  st.session_state.results_df = pd.DataFrame()
633
  st.session_state.elapsed_time = 0.0
634
  st.session_state.topic_results = None
635
-
636
  # --- Text Input and Clear Button ---
637
  word_limit = 1000
638
  text = st.text_area(
639
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
640
  height=250,
641
- key='my_text_area', # Streamlit automatically uses st.session_state.my_text_area here
642
  )
643
-
644
  word_count = len(text.split())
645
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
646
  st.button("Clear text", on_click=clear_text)
647
-
648
  # --- Results Trigger and Processing (Updated Logic) ---
649
  if st.button("Results"):
650
  if not text.strip():
@@ -658,25 +644,20 @@ if st.button("Results"):
658
  if text != st.session_state.last_text:
659
  st.session_state.last_text = text
660
  start_time = time.time()
661
-
662
  # --- Model Prediction & Dataframe Creation ---
663
  entities = model.predict_entities(text, labels)
664
  df = pd.DataFrame(entities)
665
-
666
  if not df.empty:
667
  df['text'] = df['text'].apply(remove_trailing_punctuation)
668
  df['category'] = df['label'].map(reverse_category_mapping)
669
  st.session_state.results_df = df
670
-
671
  unique_entity_count = len(df['text'].unique())
672
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
673
-
674
  st.session_state.topic_results = perform_topic_modeling(
675
  df,
676
  num_topics=2,
677
  num_top_words=N_TOP_WORDS_TO_USE
678
  )
679
-
680
  if comet_initialized:
681
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
682
  experiment.log_parameter("input_text", text)
@@ -685,37 +666,32 @@ if st.button("Results"):
685
  else:
686
  st.session_state.results_df = pd.DataFrame()
687
  st.session_state.topic_results = None
688
-
689
  end_time = time.time()
690
  st.session_state.elapsed_time = end_time - start_time
691
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
692
-
693
  st.session_state.show_results = True
694
-
695
- # --- Display Download Link and Results (Updated with Download Buttons) ---
696
  if st.session_state.show_results:
697
  df = st.session_state.results_df
698
  df_topic_data = st.session_state.topic_results
699
-
700
  if df.empty:
701
  st.warning("No entities were found in the provided text.")
702
  else:
703
  st.subheader("Analysis Results", divider="blue")
704
-
705
  # 1. Highlighted Text
706
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
707
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
708
-
709
  # 2. Detailed Entity Analysis Tabs
710
  st.markdown("### 2. Detailed Entity Analysis")
711
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
712
-
713
  with tab_category_details:
714
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
715
-
 
 
716
  unique_categories = list(category_mapping.keys())
717
  tabs_category = st.tabs(unique_categories)
718
-
719
  for category, tab in zip(unique_categories, tabs_category):
720
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
721
  with tab:
@@ -726,45 +702,89 @@ if st.session_state.show_results:
726
  use_container_width=True,
727
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
728
  )
729
-
 
 
 
 
 
 
 
 
 
 
 
 
730
  with tab_treemap_viz:
 
731
  fig_treemap = px.treemap(
732
  df,
733
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
734
  values='score',
735
  color='category',
736
- title="Entity Distribution by Category and Label",
737
  color_discrete_sequence=px.colors.qualitative.Dark24
738
  )
739
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
740
  st.plotly_chart(fig_treemap, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
 
742
-
743
- # 3. Download Options (NEW)
744
- st.markdown("### 3. Download Options")
745
- col_csv, col_html = st.columns(2)
746
-
747
- with col_csv:
748
- csv_data = generate_entity_csv(df)
749
- st.download_button(
750
- label="Download Entities as CSV",
751
- data=csv_data,
752
- file_name="entity_analysis_data.csv",
753
- mime="text/csv",
754
- type="primary"
755
- )
756
-
757
- with col_html:
758
- html_report = generate_html_report(
759
- df,
760
- st.session_state.last_text,
761
- st.session_state.elapsed_time,
762
- df_topic_data
763
- )
764
- st.download_button(
765
- label="Download Full HTML Report",
766
- data=html_report,
767
- file_name="entity_topic_report.html",
768
- mime="text/html",
769
- type="secondary"
770
- )
 
1
  import os
2
+ os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
  import streamlit.components.v1 as components
 
11
  import re
12
  import string
13
  import json
14
+ # --- PPTX Imports ---
 
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
  import plotly.io as pio # Required for image export
20
+ # ---------------------------
 
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
23
  from sklearn.decomposition import LatentDirichletAllocation
24
+ # ------------------------------
 
25
  from gliner import GLiNER
26
  from streamlit_extras.stylable_container import stylable_container
27
 
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
  # Using a try/except for comet_ml import
37
  try:
38
  from comet_ml import Experiment
 
42
  def log_parameter(self, *args): pass
43
  def log_table(self, *args): pass
44
  def end(self): pass
 
45
  # --- Model Home Directory (Fix for deployment environments) ---
46
  # Set HF_HOME environment variable to a writable path
47
  os.environ['HF_HOME'] = '/tmp'
 
48
  # --- Color Map for Highlighting and Network Graph Nodes ---
49
  entity_color_map = {
50
  "person": "#10b981",
 
56
  "cardinal": "#06b6d4",
57
  "money": "#f43f5e",
58
  "position": "#a855f7",
59
+ }
 
60
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
61
  labels = list(entity_color_map.keys())
62
  category_mapping = {
63
  "People": ["person", "organization", "position"],
64
  "Locations": ["country", "city"],
65
  "Time": ["date", "time"],
66
+ "Numbers": ["money", "cardinal"]}
 
67
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
68
  # --- Utility Functions for Analysis and Plotly ---
69
  def extract_label(node_name):
70
  """Extracts the label from a node string like 'Text (Label)'."""
71
  match = re.search(r'\(([^)]+)\)$', node_name)
72
  return match.group(1) if match else "Unknown"
 
73
  def remove_trailing_punctuation(text_string):
74
  """Removes trailing punctuation from a string."""
75
  return text_string.rstrip(string.punctuation)
 
76
  def highlight_entities(text, df_entities):
77
  """Generates HTML to display text with entities highlighted and colored."""
78
  if df_entities.empty:
 
93
  # Use a div to mimic the Streamlit input box style for the report
94
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
95
 
96
+
97
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
98
  """
99
  Performs basic Topic Modeling using LDA on the extracted entities,
100
+ allowing for n-grams to capture multi-word entities like 'Dr. Emily Carter'.
101
  """
102
+ # 1. Prepare Documents: Use unique entities (they are short, clean documents)
103
  documents = df_entities['text'].unique().tolist()
104
+
105
  if len(documents) < 2:
106
  return None
107
+
108
  N = min(num_top_words, len(documents))
109
 
110
  try:
111
+ # 2. Vectorizer: Use TfidfVectorizer, but allow unigrams, bigrams, and trigrams (ngram_range)
112
+ # to capture multi-word entities. We keep stop_words='english' for the *components* of the entity.
113
  tfidf_vectorizer = TfidfVectorizer(
114
  max_df=0.95,
115
  min_df=2, # Only consider words/phrases that appear at least twice to find topics
116
  stop_words='english',
117
+ ngram_range=(1, 3) # This is the KEY to capturing "Dr. Emily Carter" as a single token (if it appears enough times)
118
  )
119
 
120
  tfidf = tfidf_vectorizer.fit_transform(documents)
121
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
122
+
123
  # Check if the vocabulary is too small after tokenization/ngram generation
124
  if len(tfidf_feature_names) < num_topics:
125
  # Re-run with min_df=1 if vocab is too small
 
137
  random_state=42, n_jobs=-1
138
  )
139
  lda.fit(tfidf)
140
+
141
  # 4. Extract Topic Data
142
  topic_data_list = []
143
  for topic_idx, topic in enumerate(lda.components_):
144
  top_words_indices = topic.argsort()[:-N - 1:-1]
145
+ # These top_words will now include phrases like 'emily carter' or 'european space agency'
146
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
147
  word_weights = [topic[i] for i in top_words_indices]
148
+
149
  for word, weight in zip(top_words, word_weights):
150
  topic_data_list.append({
151
  'Topic_ID': f'Topic #{topic_idx + 1}',
152
  'Word': word,
153
  'Weight': weight,
154
  })
155
+
156
  return pd.DataFrame(topic_data_list)
157
+
158
  except Exception as e:
159
+ # A broader catch for robustness
160
+ # st.error(f"Topic modeling failed: {e}") # Keep commented out for cleaner app
161
  return None
162
+
163
+
164
+
165
+
166
 
167
  def create_topic_word_bubbles(df_topic_data):
168
  """Generates a Plotly Bubble Chart for top words across
169
  all topics, displaying the word directly on the bubble."""
170
  # Renaming columns to match the output of perform_topic_modeling
171
+ df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic',
172
+ 'Word': 'word', 'Weight': 'weight'})
173
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position
174
  if df_topic_data.empty:
175
  return None
176
+
177
  fig = px.scatter(
178
  df_topic_data,
179
  x='x_pos',
 
183
  # Set text to the word
184
  text='word',
185
  hover_name='word',
186
+ size_max=40,
187
  title='Topic Word Weights (Bubble Chart)',
188
  color_discrete_sequence=px.colors.qualitative.Bold,
189
  labels={
 
197
  fig.update_layout(
198
  xaxis_title="Entity/Word",
199
  yaxis_title="Word Weight",
200
+ # Hide x-axis labels since words are now labels
201
  xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
202
  yaxis={'showgrid': True},
203
  showlegend=True,
 
206
  height=600,
207
  margin=dict(t=50, b=100, l=50, r=10),
208
  )
209
+
210
+ # Update traces to show the word text, set the text position, and set text color
211
  fig.update_traces(
212
+ # Position the text on top of the bubble
213
  textposition='middle center',
214
+ # --- THE KEY FIX IS HERE ---
215
+ # Set the text color to white for visibility against dark bubble colors
216
+ textfont=dict(color='white', size=10),
217
+ # ---------------------------
218
  hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
219
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
220
  )
221
+
222
  return fig
223
 
224
+
225
+
226
  def generate_network_graph(df, raw_text):
227
  """
228
  Generates a network graph visualization (Node Plot) with edges
229
+ based on entity co-occurrence in sentences.
230
  """
231
+ # Count how often each entity text occurs; this frequency later scales the node size.
232
  entity_counts = df['text'].value_counts().reset_index()
233
  entity_counts.columns = ['text', 'frequency']
234
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
 
235
  if unique_entities.shape[0] < 2:
236
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
 
237
  num_nodes = len(unique_entities)
238
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
239
  radius = 10
240
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
241
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
242
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
 
243
  edges = set()
244
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
 
245
  for sentence in sentences:
246
  entities_in_sentence = []
247
  for entity_text in unique_entities['text'].unique():
248
  if entity_text.lower() in sentence.lower():
249
  entities_in_sentence.append(entity_text)
250
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
 
251
  for i in range(len(unique_entities_in_sentence)):
252
  for j in range(i + 1, len(unique_entities_in_sentence)):
253
  node1 = unique_entities_in_sentence[i]
254
  node2 = unique_entities_in_sentence[j]
255
  edge_tuple = tuple(sorted((node1, node2)))
256
  edges.add(edge_tuple)
 
257
  edge_x = []
258
  edge_y = []
259
  for edge in edges:
 
261
  if n1 in pos_map and n2 in pos_map:
262
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
263
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
264
  fig = go.Figure()
 
 
265
  edge_trace = go.Scatter(
266
  x=edge_x, y=edge_y,
267
  line=dict(width=0.5, color='#888'),
 
271
  showlegend=False
272
  )
273
  fig.add_trace(edge_trace)
 
 
274
  fig.add_trace(go.Scatter(
275
  x=unique_entities['x'],
276
  y=unique_entities['y'],
 
280
  textposition="top center",
281
  showlegend=False,
282
  marker=dict(
 
283
  size=unique_entities['frequency'] * 5 + 10,
284
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
285
  line_width=1,
 
295
  "Frequency: %{customdata[2]}<extra></extra>"
296
  )
297
  ))
 
 
298
  legend_traces = []
299
  seen_labels = set()
300
  for index, row in unique_entities.iterrows():
 
307
  ))
308
  for trace in legend_traces:
309
  fig.add_trace(trace)
 
310
  fig.update_layout(
311
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
312
  showlegend=True,
 
319
  height=600
320
  )
321
  return fig
322
+ # --- NEW CSV GENERATION FUNCTION ---
 
323
  def generate_entity_csv(df):
324
  """
325
  Generates a CSV file of the extracted entities in an in-memory buffer,
 
331
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
332
  csv_buffer.seek(0)
333
  return csv_buffer
334
+ # -----------------------------------
335
+ # --- Existing App Functionality (HTML) ---
 
336
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
337
  """
338
  Generates a full HTML report containing all analysis results and visualizations.
339
+ Returns the complete HTML document as a single string.
340
  """
341
  # 1. Generate Visualizations (Plotly HTML)
 
342
  # 1a. Treemap
343
  fig_treemap = px.treemap(
344
  df,
 
350
  )
351
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
352
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
 
353
  # 1b. Pie Chart
354
  grouped_counts = df['category'].value_counts().reset_index()
355
  grouped_counts.columns = ['Category', 'Count']
356
+ # Changed color_discrete_sequence from sequential.RdBu (which has reds) to sequential.Cividis
357
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
358
  fig_pie.update_layout(margin=dict(t=50, b=10))
359
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
360
  # 1c. Bar Chart (Category Count)
361
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
362
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
363
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
364
  # 1d. Bar Chart (Most Frequent Entities)
365
  word_counts = df['text'].value_counts().reset_index()
366
  word_counts.columns = ['Entity', 'Count']
367
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
368
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
369
  if not repeating_entities.empty:
370
+ # Changed color_discrete_sequence from sequential.Plasma (which has pink/magenta) to sequential.Viridis
371
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
372
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
373
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
374
  # 1e. Network Graph HTML
375
  network_fig = generate_network_graph(df, text_input)
376
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
 
377
  # 1f. Topic Charts HTML
378
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
379
  if df_topic_data is not None and not df_topic_data.empty:
380
  bubble_figure = create_topic_word_bubbles(df_topic_data)
381
  if bubble_figure:
382
+
383
  topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
384
  else:
385
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
386
  else:
387
+ topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
388
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
389
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
390
  topic_charts_html += '</div>'
 
391
  # 2. Get Highlighted Text
392
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
 
393
  # 3. Entity Tables (Pandas to HTML)
394
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
395
  classes='table table-striped',
396
  index=False
397
  )
398
+ # 4. Construct the Final HTML
 
399
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
400
  <meta charset="UTF-8">
401
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
402
  <title>Entity and Topic Analysis Report</title>
403
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
404
  <style>
405
+ body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
406
+ .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
407
+ h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
408
+ h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
409
  h3 {{ color: #555; margin-top: 20px; }}
410
+ .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
411
+ .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
412
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
413
+ table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
414
  table th {{ background-color: #f0f0f0; }}
415
+ .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  </style></head><body>
417
  <div class="container">
418
  <h1>Entity and Topic Analysis Report</h1>
 
441
  </div></body></html>
442
  """
443
  return html_content
 
444
  # --- Page Configuration and Styling (No Sidebar) ---
445
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
446
 
447
+
448
  # --- Conditional Mobile Warning ---
449
  st.markdown(
450
  """
 
480
  )
481
  # ----------------------------------
482
 
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
  st.markdown(
492
  """
493
  <style>
494
+ /* ... (Keep your existing styles for main, stApp, stTextArea, stButton) ... */
495
  /* --- FIX: Tab Label Colors for Visibility --- */
496
+ /* Target the container for the tab labels (the buttons) */
497
  [data-testid="stConfigurableTabs"] button {
498
+ color: #333333 !important; /* Dark gray for inactive tabs */
499
+ background-color: #f0f0f0; /* Light gray background for inactive tabs */
500
  border: 1px solid #cccccc;
501
  }
502
  /* Target the ACTIVE tab label */
503
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
504
+ color: #FFFFFF !important; /* White text for active tab */
505
+ background-color: #007bff; /* Blue background for active tab */
506
+ border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
507
  }
508
+
509
+ /* Expander header color fix (since you overwrote it to white) */
510
  .streamlit-expanderHeader {
511
+ color: #007bff; /* Blue text for Expander header */
512
  }
513
  </style>
514
  """,
515
  unsafe_allow_html=True
516
  )
517
 
518
+
519
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
520
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
521
 
522
+
523
+
524
+
525
+
526
+ tab1, tab2 = st.tabs(["Embed", "Important Notes"]) # Assuming you have defined the tabs
527
 
528
  with tab1:
529
  with st.expander("Embed"):
 
536
  height="450"
537
  ></iframe>
538
  '''
539
+ st.code(code, language="html") # Keeps the copy icon, as intended for tab1
540
+
541
+
542
 
543
  with tab2:
544
  expander = st.expander("**Important Notes**")
545
+ # Render the notes with st.markdown() as regular Markdown (no code fence),
546
+ # so they keep their bold styling and show no copy-to-clipboard icon.
547
  expander.markdown("""
548
  **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
549
+
550
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
551
+
552
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
553
+
554
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
555
  """)
556
 
557
+
558
+ st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
559
 
560
  # --- Comet ML Setup (Placeholder/Conditional) ---
561
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
562
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
563
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
564
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 
565
  # --- Model Loading ---
566
  @st.cache_resource
567
  def load_ner_model():
 
571
  except Exception as e:
572
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
573
  st.stop()
 
574
  model = load_ner_model()
 
575
  # --- LONG DEFAULT TEXT (178 Words) ---
576
+
577
  DEFAULT_TEXT = (
578
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
579
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
 
590
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
591
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
592
  )
 
593
 
594
+
595
+
596
+
597
+
598
+
599
+
600
+
601
+ # -----------------------------------
602
+ # --- Session State Initialization (CRITICAL FIX) ---
603
  if 'show_results' not in st.session_state:
604
  st.session_state.show_results = False
605
  if 'last_text' not in st.session_state:
 
610
  st.session_state.elapsed_time = 0.0
611
  if 'topic_results' not in st.session_state:
612
  st.session_state.topic_results = None
 
613
  if 'my_text_area' not in st.session_state:
614
  st.session_state.my_text_area = DEFAULT_TEXT
615
+ # --- Clear Button Function (MODIFIED) ---
 
616
  def clear_text():
617
  """Clears the text area (sets it to an empty string) and hides results."""
618
  st.session_state['my_text_area'] = ""
 
621
  st.session_state.results_df = pd.DataFrame()
622
  st.session_state.elapsed_time = 0.0
623
  st.session_state.topic_results = None
 
624
  # --- Text Input and Clear Button ---
625
  word_limit = 1000
626
  text = st.text_area(
627
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
628
  height=250,
629
+ key='my_text_area',
630
  )
 
631
  word_count = len(text.split())
632
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
633
  st.button("Clear text", on_click=clear_text)
 
634
  # --- Results Trigger and Processing (Updated Logic) ---
635
  if st.button("Results"):
636
  if not text.strip():
 
644
  if text != st.session_state.last_text:
645
  st.session_state.last_text = text
646
  start_time = time.time()
 
647
  # --- Model Prediction & Dataframe Creation ---
648
  entities = model.predict_entities(text, labels)
649
  df = pd.DataFrame(entities)
 
650
  if not df.empty:
651
  df['text'] = df['text'].apply(remove_trailing_punctuation)
652
  df['category'] = df['label'].map(reverse_category_mapping)
653
  st.session_state.results_df = df
 
654
  unique_entity_count = len(df['text'].unique())
655
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
 
656
  st.session_state.topic_results = perform_topic_modeling(
657
  df,
658
  num_topics=2,
659
  num_top_words=N_TOP_WORDS_TO_USE
660
  )
 
661
  if comet_initialized:
662
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
663
  experiment.log_parameter("input_text", text)
 
666
  else:
667
  st.session_state.results_df = pd.DataFrame()
668
  st.session_state.topic_results = None
 
669
  end_time = time.time()
670
  st.session_state.elapsed_time = end_time - start_time
671
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
 
672
  st.session_state.show_results = True
673
+ # --- Display Download Link and Results ---
 
674
  if st.session_state.show_results:
675
  df = st.session_state.results_df
676
  df_topic_data = st.session_state.topic_results
 
677
  if df.empty:
678
  st.warning("No entities were found in the provided text.")
679
  else:
680
  st.subheader("Analysis Results", divider="blue")
 
681
  # 1. Highlighted Text
682
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
683
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
684
+
685
  # 2. Detailed Entity Analysis Tabs
686
  st.markdown("### 2. Detailed Entity Analysis")
687
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
688
  with tab_category_details:
689
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
690
+
691
+
692
+
693
  unique_categories = list(category_mapping.keys())
694
  tabs_category = st.tabs(unique_categories)
 
695
  for category, tab in zip(unique_categories, tabs_category):
696
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
697
  with tab:
 
702
  use_container_width=True,
703
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
704
  )
705
+ else:
706
+ st.info(f"No entities of category **{category}** were found in the text.")
707
+
708
+
709
+ with st.expander("See Glossary of tags"):
710
+ st.write('''
711
+ - **text**: ['entity extracted from your text data']
712
+ - **label**: ['label (tag) assigned to a given extracted entity']
713
+ - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
714
+ - **start**: ['index of the start of the corresponding entity']
715
+ - **end**: ['index of the end of the corresponding entity']
716
+ ''')
717
+
718
  with tab_treemap_viz:
719
+ st.markdown("#### Treemap: Entity Distribution")
720
  fig_treemap = px.treemap(
721
  df,
722
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
723
  values='score',
724
  color='category',
 
725
  color_discrete_sequence=px.colors.qualitative.Dark24
726
  )
727
+ fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
728
  st.plotly_chart(fig_treemap, use_container_width=True)
729
+ # 3. Comparative Charts
730
+ st.markdown("---")
731
+ st.markdown("### 3. Comparative Charts")
732
+ col1, col2, col3 = st.columns(3)
733
+ grouped_counts = df['category'].value_counts().reset_index()
734
+ grouped_counts.columns = ['Category', 'Count']
735
+ with col1: # Pie Chart
736
+ # Changed color_discrete_sequence
737
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
738
+ fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
739
+ st.plotly_chart(fig_pie, use_container_width=True)
740
+ with col2: # Bar Chart (Category Count)
741
+ fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
742
+ fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
743
+ st.plotly_chart(fig_bar_category, use_container_width=True)
744
+ with col3: # Bar Chart (Most Frequent Entities)
745
+ word_counts = df['text'].value_counts().reset_index()
746
+ word_counts.columns = ['Entity', 'Count']
747
+ repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
748
+ if not repeating_entities.empty:
749
+ # Changed color_discrete_sequence
750
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
751
+ fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
752
+ st.plotly_chart(fig_bar_freq, use_container_width=True)
753
+ else:
754
+ st.info("No entities repeat for frequency chart.")
755
+ st.markdown("---")
756
+ st.markdown("### 4. Entity Relationship Map")
757
+ network_fig = generate_network_graph(df, st.session_state.last_text)
758
+ st.plotly_chart(network_fig, use_container_width=True)
759
+ st.markdown("---")
760
+ st.markdown("### 5. Topic Modelling Analysis")
761
+ if df_topic_data is not None and not df_topic_data.empty:
762
+ bubble_figure = create_topic_word_bubbles(df_topic_data)
763
+ if bubble_figure:
764
+ st.plotly_chart(bubble_figure, use_container_width=True)
765
+ else:
766
+ st.error("Error generating Topic Word Bubble Chart.")
767
+ else:
768
+ st.info("Topic modeling requires more unique input (at least two unique entities).")
769
+ # --- Report Download ---
770
+ st.markdown("---")
771
+ st.markdown("### Download Full Report Artifacts")
772
+ # 1. HTML Report Download (Retained)
773
+ html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
774
+ st.download_button(
775
+ label="Download Comprehensive HTML Report",
776
+ data=html_report,
777
+ file_name="ner_topic_report.html",
778
+ mime="text/html",
779
+ type="primary"
780
+ )
781
 
782
+ # 2. CSV Data Download (NEW)
783
+ csv_buffer = generate_entity_csv(df)
784
+ st.download_button(
785
+ label="Download Extracted Entities (CSV)",
786
+ data=csv_buffer,
787
+ file_name="extracted_entities.csv",
788
+ mime="text/csv",
789
+ type="secondary"
790
+ )