AIEcosystem committed on
Commit
932f856
·
verified ·
1 Parent(s): b90f5cd

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +31 -133
src/streamlit_app.py CHANGED
@@ -24,7 +24,6 @@ from sklearn.decomposition import LatentDirichletAllocation
24
  # ------------------------------
25
  from gliner import GLiNER
26
  from streamlit_extras.stylable_container import stylable_container
27
-
28
  # Using a try/except for comet_ml import
29
  try:
30
  from comet_ml import Experiment
@@ -34,79 +33,56 @@ except ImportError:
34
  def log_parameter(self, *args): pass
35
  def log_table(self, *args): pass
36
  def end(self): pass
37
-
38
  # --- Model Home Directory (Fix for deployment environments) ---
39
  # Set HF_HOME environment variable to a writable path
40
  os.environ['HF_HOME'] = '/tmp'
41
-
42
# --- Color Map for Highlighting and Network Graph Nodes ---
# Hex color assigned to each supported entity label; used both for the
# inline HTML highlights and for node colors in the network graph.
entity_color_map = {
    "person": "#10b981",
    "country": "#3b82f6",
    "city": "#4ade80",
    "organization": "#f59e0b",
    "date": "#8b5cf6",
    "time": "#ec4899",
    "cardinal": "#06b6d4",
    "money": "#f43f5e",
    "position": "#a855f7",
}

# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
# The NER model is queried with exactly these labels, in this order.
labels = list(entity_color_map.keys())
59
-
60
-
61
-
62
# Display category -> list of fine-grained entity labels it groups.
category_mapping = {
    "People": ["person", "organization", "position"],
    "Locations": ["country", "city"],
    "Time": ["date", "time"],
    "Numbers": ["money", "cardinal"],
}

# Inverse lookup: fine-grained label -> its display category.
reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
74
-
75
-
76
  # --- Utility Functions for Analysis and Plotly ---
77
def extract_label(node_name):
    """Return the parenthesized label at the end of a node string.

    For input shaped like 'Text (Label)' this returns 'Label'; when no
    trailing '(...)' group is present it returns "Unknown".
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found:
        return found.group(1)
    return "Unknown"
81
-
82
def remove_trailing_punctuation(text_string):
    """Strip any run of ASCII punctuation characters from the end of the string."""
    trimmed = text_string.rstrip(string.punctuation)
    return trimmed
85
-
86
def highlight_entities(text, df_entities):
    """Render *text* as HTML with each detected entity wrapped in a colored span.

    Entities are applied from the highest start offset down so that earlier
    character offsets remain valid while the string grows. When *df_entities*
    is empty the raw text is returned unchanged.
    """
    if df_entities.empty:
        return text

    # Walk entities from the end of the string toward the front so each
    # insertion leaves the preceding start/end offsets untouched.
    result = text
    for record in df_entities.sort_values(by='start', ascending=False).to_dict('records'):
        label = record['label']
        entity_text = record['text']
        span_color = entity_color_map.get(label, '#000000')
        # NOTE(review): entity_text/label are interpolated into HTML without
        # escaping — confirm the input is trusted or add html.escape().
        highlight_html = f'<span style="background-color: {span_color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
        result = result[:record['start']] + highlight_html + result[record['end']:]

    # Wrap in a div styled to mimic the Streamlit input box for the report.
    return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{result}</div>'
109
-
110
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
111
  """
112
  Performs basic Topic Modeling using LDA on the extracted entities
@@ -115,7 +91,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
115
  documents = df_entities['text'].unique().tolist()
116
  if len(documents) < 2:
117
  return None
118
-
119
  N = min(num_top_words, len(documents))
120
  try:
121
  tfidf_vectorizer = TfidfVectorizer(
@@ -125,7 +100,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
125
  )
126
  tfidf = tfidf_vectorizer.fit_transform(documents)
127
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
128
-
129
  lda = LatentDirichletAllocation(
130
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
131
  )
@@ -145,13 +119,11 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
145
  except Exception as e:
146
  st.error(f"Topic modeling failed: {e}")
147
  return None
148
-
149
  def create_topic_word_bubbles(df_topic_data):
150
  """Generates a Plotly Bubble Chart for top words across all topics."""
151
  # Renaming columns to match the output of perform_topic_modeling
152
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
153
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
154
-
155
  if df_topic_data.empty:
156
  return None
157
  fig = px.scatter(
@@ -177,14 +149,13 @@ def create_topic_word_bubbles(df_topic_data):
177
  xaxis={'tickangle': -45, 'showgrid': False},
178
  yaxis={'showgrid': True},
179
  showlegend=True,
180
- plot_bgcolor='#FFF0F5',
181
- paper_bgcolor='#FFF0F5',
182
  height=600,
183
  margin=dict(t=50, b=100, l=50, r=10),
184
  )
185
  fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
186
  return fig
187
-
188
  def generate_network_graph(df, raw_text):
189
  """
190
  Generates a network graph visualization (Node Plot) with edges
@@ -193,21 +164,16 @@ def generate_network_graph(df, raw_text):
193
  # Using the existing generate_network_graph logic from previous context...
194
  entity_counts = df['text'].value_counts().reset_index()
195
  entity_counts.columns = ['text', 'frequency']
196
-
197
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
198
  if unique_entities.shape[0] < 2:
199
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
200
-
201
  num_nodes = len(unique_entities)
202
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
203
-
204
  radius = 10
205
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
206
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
207
-
208
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
209
  edges = set()
210
-
211
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
212
  for sentence in sentences:
213
  entities_in_sentence = []
@@ -215,25 +181,20 @@ def generate_network_graph(df, raw_text):
215
  if entity_text.lower() in sentence.lower():
216
  entities_in_sentence.append(entity_text)
217
  unique_entities_in_sentence = list(set(entities_in_sentence))
218
-
219
  for i in range(len(unique_entities_in_sentence)):
220
  for j in range(i + 1, len(unique_entities_in_sentence)):
221
  node1 = unique_entities_in_sentence[i]
222
  node2 = unique_entities_in_sentence[j]
223
  edge_tuple = tuple(sorted((node1, node2)))
224
  edges.add(edge_tuple)
225
-
226
  edge_x = []
227
  edge_y = []
228
-
229
  for edge in edges:
230
  n1, n2 = edge
231
  if n1 in pos_map and n2 in pos_map:
232
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
233
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
234
-
235
  fig = go.Figure()
236
-
237
  edge_trace = go.Scatter(
238
  x=edge_x, y=edge_y,
239
  line=dict(width=0.5, color='#888'),
@@ -243,7 +204,6 @@ def generate_network_graph(df, raw_text):
243
  showlegend=False
244
  )
245
  fig.add_trace(edge_trace)
246
-
247
  fig.add_trace(go.Scatter(
248
  x=unique_entities['x'],
249
  y=unique_entities['y'],
@@ -268,7 +228,6 @@ def generate_network_graph(df, raw_text):
268
  "Frequency: %{customdata[2]}<extra></extra>"
269
  )
270
  ))
271
-
272
  legend_traces = []
273
  seen_labels = set()
274
  for index, row in unique_entities.iterrows():
@@ -281,7 +240,6 @@ def generate_network_graph(df, raw_text):
281
  ))
282
  for trace in legend_traces:
283
  fig.add_trace(trace)
284
-
285
  fig.update_layout(
286
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
287
  showlegend=True,
@@ -293,16 +251,7 @@ def generate_network_graph(df, raw_text):
293
  margin=dict(t=50, b=10, l=10, r=10),
294
  height=600
295
  )
296
-
297
  return fig
298
-
299
-
300
-
301
-
302
-
303
-
304
-
305
-
306
  # --- NEW CSV GENERATION FUNCTION ---
307
  def generate_entity_csv(df):
308
  """
@@ -316,16 +265,13 @@ def generate_entity_csv(df):
316
  csv_buffer.seek(0)
317
  return csv_buffer
318
  # -----------------------------------
319
-
320
  # --- Existing App Functionality (HTML) ---
321
-
322
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
323
  """
324
  Generates a full HTML report containing all analysis results and visualizations.
325
  (Content omitted for brevity but assumed to be here).
326
  """
327
  # 1. Generate Visualizations (Plotly HTML)
328
-
329
  # 1a. Treemap
330
  fig_treemap = px.treemap(
331
  df,
@@ -337,34 +283,30 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
337
  )
338
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
339
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
340
-
341
  # 1b. Pie Chart
342
  grouped_counts = df['category'].value_counts().reset_index()
343
  grouped_counts.columns = ['Category', 'Count']
344
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
 
345
  fig_pie.update_layout(margin=dict(t=50, b=10))
346
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
347
-
348
  # 1c. Bar Chart (Category Count)
349
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
350
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
351
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
352
-
353
  # 1d. Bar Chart (Most Frequent Entities)
354
  word_counts = df['text'].value_counts().reset_index()
355
  word_counts.columns = ['Entity', 'Count']
356
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
357
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
358
-
359
  if not repeating_entities.empty:
360
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
 
361
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
362
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
363
-
364
  # 1e. Network Graph HTML
365
  network_fig = generate_network_graph(df, text_input)
366
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
367
-
368
  # 1f. Topic Charts HTML
369
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
370
  if df_topic_data is not None and not df_topic_data.empty:
@@ -374,20 +316,17 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
374
  else:
375
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
376
  else:
377
- topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
378
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
379
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
380
  topic_charts_html += '</div>'
381
-
382
  # 2. Get Highlighted Text
383
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
384
-
385
  # 3. Entity Tables (Pandas to HTML)
386
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
387
  classes='table table-striped',
388
  index=False
389
  )
390
-
391
  # 4. Construct the Final HTML
392
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
393
  <meta charset="UTF-8">
@@ -397,21 +336,20 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
397
  <style>
398
  body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
399
  .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
400
- h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
401
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
402
  h3 {{ color: #555; margin-top: 20px; }}
403
- .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
404
  .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
405
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
406
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
407
  table th {{ background-color: #f0f0f0; }}
408
- .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
409
  </style></head><body>
410
  <div class="container">
411
  <h1>Entity and Topic Analysis Report</h1>
412
  <div class="metadata">
413
- <p><strong>Generated at:</strong> {time.strftime('%Y-%m-%d')}</p>
414
-
415
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
416
  </div>
417
  <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -435,8 +373,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
435
  </div></body></html>
436
  """
437
  return html_content
438
-
439
-
440
  # --- Page Configuration and Styling (No Sidebar) ---
441
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
442
  st.markdown(
@@ -444,21 +380,21 @@ st.markdown(
444
  <style>
445
  /* Overall app container - NO SIDEBAR */
446
  .main {
447
- background-color: #FFF0F5; /* Blanched Almond/Light Pink */
448
  color: #333333; /* Dark grey text for contrast */
449
  }
450
  .stApp {
451
- background-color: #FFF0F5;
452
  }
453
  /* Text Area background and text color (input fields) */
454
  .stTextArea textarea {
455
- background-color: #FFFAF0; /* Floral White/Near white for input fields */
456
  color: #000000; /* Black text for input */
457
- border: 1px solid #FF69B4; /* Deep Pink border */
458
  }
459
  /* Button styling */
460
  .stButton > button {
461
- background-color: #FF69B4; /* Deep Pink for the button */
462
  color: #FFFFFF; /* White text for contrast */
463
  border: none;
464
  padding: 10px 20px;
@@ -466,24 +402,25 @@ st.markdown(
466
  }
467
  /* Expander header and content background */
468
  .streamlit-expanderHeader, .streamlit-expanderContent {
469
- background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
470
  color: #333333;
471
  }
472
  </style>
473
  """,
474
  unsafe_allow_html=True)
475
- st.subheader("Entity and Topic Analysis Report Generator", divider="rainbow")
476
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
477
-
478
  tab1, tab2 = st.tabs(["Important Notes", "Embed"])
479
  with tab1:
480
  expander = st.expander("**Important notes**")
481
  expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
 
482
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
 
483
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
 
484
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
485
 
486
-
487
  with tab2:
488
  with st.expander("Embed"):
489
  st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
@@ -496,15 +433,13 @@ with tab2:
496
  ></iframe>
497
  '''
498
  st.code(code, language="html")
499
-
500
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
501
-
502
  # --- Comet ML Setup (Placeholder/Conditional) ---
503
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
504
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
505
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
506
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
507
-
508
  # --- Model Loading ---
509
  @st.cache_resource
510
  def load_ner_model():
@@ -514,9 +449,7 @@ def load_ner_model():
514
  except Exception as e:
515
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
516
  st.stop()
517
-
518
  model = load_ner_model()
519
-
520
  # --- LONG DEFAULT TEXT (178 Words) ---
521
  DEFAULT_TEXT = (
522
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -531,8 +464,7 @@ DEFAULT_TEXT = (
531
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
532
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
533
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
534
- "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
535
- )
536
  # -----------------------------------
537
  # --- Session State Initialization (CRITICAL FIX) ---
538
  if 'show_results' not in st.session_state:
@@ -547,7 +479,6 @@ if 'topic_results' not in st.session_state:
547
  st.session_state.topic_results = None
548
  if 'my_text_area' not in st.session_state:
549
  st.session_state.my_text_area = DEFAULT_TEXT
550
-
551
  # --- Clear Button Function (MODIFIED) ---
552
  def clear_text():
553
  """Clears the text area (sets it to an empty string) and hides results."""
@@ -557,7 +488,6 @@ def clear_text():
557
  st.session_state.results_df = pd.DataFrame()
558
  st.session_state.elapsed_time = 0.0
559
  st.session_state.topic_results = None
560
-
561
  # --- Text Input and Clear Button ---
562
  word_limit = 1000
563
  text = st.text_area(
@@ -565,11 +495,9 @@ text = st.text_area(
565
  height=250,
566
  key='my_text_area',
567
  value=st.session_state.my_text_area)
568
-
569
  word_count = len(text.split())
570
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
571
  st.button("Clear text", on_click=clear_text)
572
-
573
  # --- Results Trigger and Processing (Updated Logic) ---
574
  if st.button("Results"):
575
  if not text.strip():
@@ -583,25 +511,20 @@ if st.button("Results"):
583
  if text != st.session_state.last_text:
584
  st.session_state.last_text = text
585
  start_time = time.time()
586
-
587
  # --- Model Prediction & Dataframe Creation ---
588
  entities = model.predict_entities(text, labels)
589
  df = pd.DataFrame(entities)
590
-
591
  if not df.empty:
592
  df['text'] = df['text'].apply(remove_trailing_punctuation)
593
  df['category'] = df['label'].map(reverse_category_mapping)
594
  st.session_state.results_df = df
595
-
596
  unique_entity_count = len(df['text'].unique())
597
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
598
-
599
  st.session_state.topic_results = perform_topic_modeling(
600
  df,
601
  num_topics=2,
602
  num_top_words=N_TOP_WORDS_TO_USE
603
  )
604
-
605
  if comet_initialized:
606
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
607
  experiment.log_parameter("input_text", text)
@@ -610,33 +533,25 @@ if st.button("Results"):
610
  else:
611
  st.session_state.results_df = pd.DataFrame()
612
  st.session_state.topic_results = None
613
-
614
  end_time = time.time()
615
  st.session_state.elapsed_time = end_time - start_time
616
-
617
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
618
  st.session_state.show_results = True
619
-
620
  # --- Display Download Link and Results ---
621
  if st.session_state.show_results:
622
  df = st.session_state.results_df
623
  df_topic_data = st.session_state.topic_results
624
-
625
  if df.empty:
626
  st.warning("No entities were found in the provided text.")
627
  else:
628
  st.subheader("Analysis Results", divider="blue")
629
-
630
  # 1. Highlighted Text
631
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
632
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
633
 
634
-
635
-
636
  # 2. Detailed Entity Analysis Tabs
637
  st.markdown("### 2. Detailed Entity Analysis")
638
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
639
-
640
  with tab_category_details:
641
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
642
  with st.expander("See Glossary of tags"):
@@ -647,7 +562,6 @@ if st.session_state.show_results:
647
  - **start**: ['index of the start of the corresponding entity']
648
  - **end**: ['index of the end of the corresponding entity']
649
  ''')
650
-
651
  unique_categories = list(category_mapping.keys())
652
  tabs_category = st.tabs(unique_categories)
653
  for category, tab in zip(unique_categories, tabs_category):
@@ -662,7 +576,6 @@ if st.session_state.show_results:
662
  )
663
  else:
664
  st.info(f"No entities of category **{category}** were found in the text.")
665
-
666
  with tab_treemap_viz:
667
  st.markdown("#### Treemap: Entity Distribution")
668
  fig_treemap = px.treemap(
@@ -670,50 +583,42 @@ if st.session_state.show_results:
670
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
671
  values='score',
672
  color='category',
673
-
674
  color_discrete_sequence=px.colors.qualitative.Dark24
675
  )
676
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
677
  st.plotly_chart(fig_treemap, use_container_width=True)
678
-
679
  # 3. Comparative Charts
680
  st.markdown("---")
681
  st.markdown("### 3. Comparative Charts")
682
-
683
  col1, col2, col3 = st.columns(3)
684
-
685
  grouped_counts = df['category'].value_counts().reset_index()
686
  grouped_counts.columns = ['Category', 'Count']
687
-
688
  with col1: # Pie Chart
689
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
 
690
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
691
  st.plotly_chart(fig_pie, use_container_width=True)
692
-
693
  with col2: # Bar Chart (Category Count)
694
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
695
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
696
  st.plotly_chart(fig_bar_category, use_container_width=True)
697
-
698
  with col3: # Bar Chart (Most Frequent Entities)
699
  word_counts = df['text'].value_counts().reset_index()
700
  word_counts.columns = ['Entity', 'Count']
701
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
702
  if not repeating_entities.empty:
703
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
 
704
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
705
  st.plotly_chart(fig_bar_freq, use_container_width=True)
706
  else:
707
  st.info("No entities repeat for frequency chart.")
708
-
709
  st.markdown("---")
710
  st.markdown("### 4. Entity Relationship Map")
711
  network_fig = generate_network_graph(df, st.session_state.last_text)
712
  st.plotly_chart(network_fig, use_container_width=True)
713
-
714
  st.markdown("---")
715
  st.markdown("### 5. Topic Modelling Analysis")
716
-
717
  if df_topic_data is not None and not df_topic_data.empty:
718
  bubble_figure = create_topic_word_bubbles(df_topic_data)
719
  if bubble_figure:
@@ -722,11 +627,9 @@ if st.session_state.show_results:
722
  st.error("Error generating Topic Word Bubble Chart.")
723
  else:
724
  st.info("Topic modeling requires more unique input (at least two unique entities).")
725
-
726
  # --- Report Download ---
727
  st.markdown("---")
728
  st.markdown("### Download Full Report Artifacts")
729
-
730
  # 1. HTML Report Download (Retained)
731
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
732
  st.download_button(
@@ -737,9 +640,7 @@ if st.session_state.show_results:
737
  type="primary"
738
  )
739
 
740
-
741
-
742
- # 2. CSV Data Download (NEW)
743
  csv_buffer = generate_entity_csv(df)
744
  st.download_button(
745
  label="Download Extracted Entities (CSV)",
@@ -748,6 +649,3 @@ if st.session_state.show_results:
748
  mime="text/csv",
749
  type="secondary"
750
  )
751
-
752
-
753
-
 
24
  # ------------------------------
25
  from gliner import GLiNER
26
  from streamlit_extras.stylable_container import stylable_container
 
27
  # Using a try/except for comet_ml import
28
  try:
29
  from comet_ml import Experiment
 
33
  def log_parameter(self, *args): pass
34
  def log_table(self, *args): pass
35
  def end(self): pass
 
36
  # --- Model Home Directory (Fix for deployment environments) ---
37
  # Set HF_HOME environment variable to a writable path
38
  os.environ['HF_HOME'] = '/tmp'
 
39
  # --- Color Map for Highlighting and Network Graph Nodes ---
40
  entity_color_map = {
41
  "person": "#10b981",
42
  "country": "#3b82f6",
43
  "city": "#4ade80",
 
44
  "organization": "#f59e0b",
45
  "date": "#8b5cf6",
46
  "time": "#ec4899",
47
  "cardinal": "#06b6d4",
48
  "money": "#f43f5e",
49
  "position": "#a855f7",
50
+ }
 
 
51
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
52
  labels = list(entity_color_map.keys())
 
 
 
53
  category_mapping = {
54
  "People": ["person", "organization", "position"],
55
  "Locations": ["country", "city"],
56
  "Time": ["date", "time"],
57
+ "Numbers": ["money", "cardinal"]}
 
 
 
 
 
 
58
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
 
59
  # --- Utility Functions for Analysis and Plotly ---
60
  def extract_label(node_name):
61
  """Extracts the label from a node string like 'Text (Label)'."""
62
  match = re.search(r'\(([^)]+)\)$', node_name)
63
  return match.group(1) if match else "Unknown"
 
64
  def remove_trailing_punctuation(text_string):
65
  """Removes trailing punctuation from a string."""
66
  return text_string.rstrip(string.punctuation)
 
67
  def highlight_entities(text, df_entities):
68
  """Generates HTML to display text with entities highlighted and colored."""
69
  if df_entities.empty:
70
  return text
 
71
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
72
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
73
  highlighted_text = text
 
74
  for entity in entities:
75
  start = entity['start']
76
  end = entity['end']
77
  label = entity['label']
78
  entity_text = entity['text']
79
  color = entity_color_map.get(label, '#000000')
 
80
  # Create a span with background color and tooltip
81
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
82
  # Replace the original text segment with the highlighted HTML
83
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
 
84
  # Use a div to mimic the Streamlit input box style for the report
85
+ return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 
86
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
87
  """
88
  Performs basic Topic Modeling using LDA on the extracted entities
 
91
  documents = df_entities['text'].unique().tolist()
92
  if len(documents) < 2:
93
  return None
 
94
  N = min(num_top_words, len(documents))
95
  try:
96
  tfidf_vectorizer = TfidfVectorizer(
 
100
  )
101
  tfidf = tfidf_vectorizer.fit_transform(documents)
102
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
103
  lda = LatentDirichletAllocation(
104
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
105
  )
 
119
  except Exception as e:
120
  st.error(f"Topic modeling failed: {e}")
121
  return None
 
122
  def create_topic_word_bubbles(df_topic_data):
123
  """Generates a Plotly Bubble Chart for top words across all topics."""
124
  # Renaming columns to match the output of perform_topic_modeling
125
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
126
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
 
127
  if df_topic_data.empty:
128
  return None
129
  fig = px.scatter(
 
149
  xaxis={'tickangle': -45, 'showgrid': False},
150
  yaxis={'showgrid': True},
151
  showlegend=True,
152
+ plot_bgcolor='#f9f9f9', # Changed from pink
153
+ paper_bgcolor='#f9f9f9', # Changed from pink
154
  height=600,
155
  margin=dict(t=50, b=100, l=50, r=10),
156
  )
157
  fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
158
  return fig
 
159
  def generate_network_graph(df, raw_text):
160
  """
161
  Generates a network graph visualization (Node Plot) with edges
 
164
  # Using the existing generate_network_graph logic from previous context...
165
  entity_counts = df['text'].value_counts().reset_index()
166
  entity_counts.columns = ['text', 'frequency']
 
167
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
168
  if unique_entities.shape[0] < 2:
169
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
170
  num_nodes = len(unique_entities)
171
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
 
172
  radius = 10
173
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
174
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
 
175
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
176
  edges = set()
 
177
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
178
  for sentence in sentences:
179
  entities_in_sentence = []
 
181
  if entity_text.lower() in sentence.lower():
182
  entities_in_sentence.append(entity_text)
183
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
184
  for i in range(len(unique_entities_in_sentence)):
185
  for j in range(i + 1, len(unique_entities_in_sentence)):
186
  node1 = unique_entities_in_sentence[i]
187
  node2 = unique_entities_in_sentence[j]
188
  edge_tuple = tuple(sorted((node1, node2)))
189
  edges.add(edge_tuple)
 
190
  edge_x = []
191
  edge_y = []
 
192
  for edge in edges:
193
  n1, n2 = edge
194
  if n1 in pos_map and n2 in pos_map:
195
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
196
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
197
  fig = go.Figure()
 
198
  edge_trace = go.Scatter(
199
  x=edge_x, y=edge_y,
200
  line=dict(width=0.5, color='#888'),
 
204
  showlegend=False
205
  )
206
  fig.add_trace(edge_trace)
 
207
  fig.add_trace(go.Scatter(
208
  x=unique_entities['x'],
209
  y=unique_entities['y'],
 
228
  "Frequency: %{customdata[2]}<extra></extra>"
229
  )
230
  ))
 
231
  legend_traces = []
232
  seen_labels = set()
233
  for index, row in unique_entities.iterrows():
 
240
  ))
241
  for trace in legend_traces:
242
  fig.add_trace(trace)
 
243
  fig.update_layout(
244
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
245
  showlegend=True,
 
251
  margin=dict(t=50, b=10, l=10, r=10),
252
  height=600
253
  )
 
254
  return fig
 
 
 
 
 
 
 
 
255
  # --- NEW CSV GENERATION FUNCTION ---
256
  def generate_entity_csv(df):
257
  """
 
265
  csv_buffer.seek(0)
266
  return csv_buffer
267
  # -----------------------------------
 
268
  # --- Existing App Functionality (HTML) ---
 
269
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
270
  """
271
  Generates a full HTML report containing all analysis results and visualizations.
272
  (Content omitted for brevity but assumed to be here).
273
  """
274
  # 1. Generate Visualizations (Plotly HTML)
 
275
  # 1a. Treemap
276
  fig_treemap = px.treemap(
277
  df,
 
283
  )
284
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
285
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
 
286
  # 1b. Pie Chart
287
  grouped_counts = df['category'].value_counts().reset_index()
288
  grouped_counts.columns = ['Category', 'Count']
289
+ # Changed color_discrete_sequence from sequential.RdBu (which has reds) to sequential.Cividis
290
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
291
  fig_pie.update_layout(margin=dict(t=50, b=10))
292
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
293
  # 1c. Bar Chart (Category Count)
294
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
295
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
296
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
297
  # 1d. Bar Chart (Most Frequent Entities)
298
  word_counts = df['text'].value_counts().reset_index()
299
  word_counts.columns = ['Entity', 'Count']
300
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
301
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
 
302
  if not repeating_entities.empty:
303
+ # Changed color_discrete_sequence from sequential.Plasma (which has pink/magenta) to sequential.Viridis
304
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
305
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
306
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
307
  # 1e. Network Graph HTML
308
  network_fig = generate_network_graph(df, text_input)
309
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
 
310
  # 1f. Topic Charts HTML
311
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
312
  if df_topic_data is not None and not df_topic_data.empty:
 
316
  else:
317
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
318
  else:
319
+ topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
320
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
321
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
322
  topic_charts_html += '</div>'
 
323
  # 2. Get Highlighted Text
324
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
 
325
  # 3. Entity Tables (Pandas to HTML)
326
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
327
  classes='table table-striped',
328
  index=False
329
  )
 
330
  # 4. Construct the Final HTML
331
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
332
  <meta charset="UTF-8">
 
336
  <style>
337
  body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
338
  .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
339
+ h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
340
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
341
  h3 {{ color: #555; margin-top: 20px; }}
342
+ .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
343
  .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
344
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
345
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
346
  table th {{ background-color: #f0f0f0; }}
347
+ .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
348
  </style></head><body>
349
  <div class="container">
350
  <h1>Entity and Topic Analysis Report</h1>
351
  <div class="metadata">
352
+ <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
 
353
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
354
  </div>
355
  <h2>1. Analyzed Text & Extracted Entities</h2>
 
373
  </div></body></html>
374
  """
375
  return html_content
 
 
376
  # --- Page Configuration and Styling (No Sidebar) ---
377
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
378
  st.markdown(
 
380
  <style>
381
  /* Overall app container - NO SIDEBAR */
382
  .main {
383
+ background-color: #f4f4f9; /* Changed from light pink */
384
  color: #333333; /* Dark grey text for contrast */
385
  }
386
  .stApp {
387
+ background-color: #f4f4f9; /* Changed from light pink */
388
  }
389
  /* Text Area background and text color (input fields) */
390
  .stTextArea textarea {
391
+ background-color: #ffffff; /* Changed from near white/pinkish */
392
  color: #000000; /* Black text for input */
393
+ border: 1px solid #888888; /* Changed border from pink to grey */
394
  }
395
  /* Button styling */
396
  .stButton > button {
397
+ background-color: #007bff; /* Changed from Deep Pink to Blue */
398
  color: #FFFFFF; /* White text for contrast */
399
  border: none;
400
  padding: 10px 20px;
 
402
  }
403
  /* Expander header and content background */
404
  .streamlit-expanderHeader, .streamlit-expanderContent {
405
+ background-color: #e9ecef; /* Changed from lighter pink to light grey/blue */
406
  color: #333333;
407
  }
408
  </style>
409
  """,
410
  unsafe_allow_html=True)
411
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
412
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 
413
  tab1, tab2 = st.tabs(["Important Notes", "Embed"])
414
  with tab1:
415
  expander = st.expander("**Important notes**")
416
  expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
417
+
418
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
419
+
420
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
421
+
422
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
423
 
 
424
  with tab2:
425
  with st.expander("Embed"):
426
  st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
 
433
  ></iframe>
434
  '''
435
  st.code(code, language="html")
436
+
437
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
438
  # --- Comet ML Setup (Placeholder/Conditional) ---
439
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
440
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
441
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
442
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 
443
  # --- Model Loading ---
444
  @st.cache_resource
445
  def load_ner_model():
 
449
  except Exception as e:
450
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
451
  st.stop()
 
452
  model = load_ner_model()
 
453
  # --- LONG DEFAULT TEXT (178 Words) ---
454
  DEFAULT_TEXT = (
455
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
 
464
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
465
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
466
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
467
+ "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
 
468
  # -----------------------------------
469
  # --- Session State Initialization (CRITICAL FIX) ---
470
  if 'show_results' not in st.session_state:
 
479
  st.session_state.topic_results = None
480
  if 'my_text_area' not in st.session_state:
481
  st.session_state.my_text_area = DEFAULT_TEXT
 
482
  # --- Clear Button Function (MODIFIED) ---
483
  def clear_text():
484
  """Clears the text area (sets it to an empty string) and hides results."""
 
488
  st.session_state.results_df = pd.DataFrame()
489
  st.session_state.elapsed_time = 0.0
490
  st.session_state.topic_results = None
 
491
  # --- Text Input and Clear Button ---
492
  word_limit = 1000
493
  text = st.text_area(
 
495
  height=250,
496
  key='my_text_area',
497
  value=st.session_state.my_text_area)
 
498
  word_count = len(text.split())
499
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
500
  st.button("Clear text", on_click=clear_text)
 
501
  # --- Results Trigger and Processing (Updated Logic) ---
502
  if st.button("Results"):
503
  if not text.strip():
 
511
  if text != st.session_state.last_text:
512
  st.session_state.last_text = text
513
  start_time = time.time()
 
514
  # --- Model Prediction & Dataframe Creation ---
515
  entities = model.predict_entities(text, labels)
516
  df = pd.DataFrame(entities)
 
517
  if not df.empty:
518
  df['text'] = df['text'].apply(remove_trailing_punctuation)
519
  df['category'] = df['label'].map(reverse_category_mapping)
520
  st.session_state.results_df = df
 
521
  unique_entity_count = len(df['text'].unique())
522
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
 
523
  st.session_state.topic_results = perform_topic_modeling(
524
  df,
525
  num_topics=2,
526
  num_top_words=N_TOP_WORDS_TO_USE
527
  )
 
528
  if comet_initialized:
529
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
530
  experiment.log_parameter("input_text", text)
 
533
  else:
534
  st.session_state.results_df = pd.DataFrame()
535
  st.session_state.topic_results = None
 
536
  end_time = time.time()
537
  st.session_state.elapsed_time = end_time - start_time
 
538
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
539
  st.session_state.show_results = True
 
540
  # --- Display Download Link and Results ---
541
  if st.session_state.show_results:
542
  df = st.session_state.results_df
543
  df_topic_data = st.session_state.topic_results
 
544
  if df.empty:
545
  st.warning("No entities were found in the provided text.")
546
  else:
547
  st.subheader("Analysis Results", divider="blue")
 
548
  # 1. Highlighted Text
549
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
550
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
551
 
 
 
552
  # 2. Detailed Entity Analysis Tabs
553
  st.markdown("### 2. Detailed Entity Analysis")
554
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
555
  with tab_category_details:
556
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
557
  with st.expander("See Glossary of tags"):
 
562
  - **start**: ['index of the start of the corresponding entity']
563
  - **end**: ['index of the end of the corresponding entity']
564
  ''')
 
565
  unique_categories = list(category_mapping.keys())
566
  tabs_category = st.tabs(unique_categories)
567
  for category, tab in zip(unique_categories, tabs_category):
 
576
  )
577
  else:
578
  st.info(f"No entities of category **{category}** were found in the text.")
 
579
  with tab_treemap_viz:
580
  st.markdown("#### Treemap: Entity Distribution")
581
  fig_treemap = px.treemap(
 
583
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
584
  values='score',
585
  color='category',
 
586
  color_discrete_sequence=px.colors.qualitative.Dark24
587
  )
588
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
589
  st.plotly_chart(fig_treemap, use_container_width=True)
 
590
  # 3. Comparative Charts
591
  st.markdown("---")
592
  st.markdown("### 3. Comparative Charts")
 
593
  col1, col2, col3 = st.columns(3)
 
594
  grouped_counts = df['category'].value_counts().reset_index()
595
  grouped_counts.columns = ['Category', 'Count']
 
596
  with col1: # Pie Chart
597
+ # Changed color_discrete_sequence
598
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
599
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
600
  st.plotly_chart(fig_pie, use_container_width=True)
 
601
  with col2: # Bar Chart (Category Count)
602
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
603
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
604
  st.plotly_chart(fig_bar_category, use_container_width=True)
 
605
  with col3: # Bar Chart (Most Frequent Entities)
606
  word_counts = df['text'].value_counts().reset_index()
607
  word_counts.columns = ['Entity', 'Count']
608
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
609
  if not repeating_entities.empty:
610
+ # Changed color_discrete_sequence
611
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
612
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
613
  st.plotly_chart(fig_bar_freq, use_container_width=True)
614
  else:
615
  st.info("No entities repeat for frequency chart.")
 
616
  st.markdown("---")
617
  st.markdown("### 4. Entity Relationship Map")
618
  network_fig = generate_network_graph(df, st.session_state.last_text)
619
  st.plotly_chart(network_fig, use_container_width=True)
 
620
  st.markdown("---")
621
  st.markdown("### 5. Topic Modelling Analysis")
 
622
  if df_topic_data is not None and not df_topic_data.empty:
623
  bubble_figure = create_topic_word_bubbles(df_topic_data)
624
  if bubble_figure:
 
627
  st.error("Error generating Topic Word Bubble Chart.")
628
  else:
629
  st.info("Topic modeling requires more unique input (at least two unique entities).")
 
630
  # --- Report Download ---
631
  st.markdown("---")
632
  st.markdown("### Download Full Report Artifacts")
 
633
  # 1. HTML Report Download (Retained)
634
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
635
  st.download_button(
 
640
  type="primary"
641
  )
642
 
643
+ # 2. CSV Data Download (NEW)
 
 
644
  csv_buffer = generate_entity_csv(df)
645
  st.download_button(
646
  label="Download Extracted Entities (CSV)",
 
649
  mime="text/csv",
650
  type="secondary"
651
  )