AIEcosystem committed on
Commit
b2f8b8b
·
verified ·
1 Parent(s): 142d571

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +149 -213
src/streamlit_app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
  import streamlit.components.v1 as components
@@ -11,28 +10,23 @@ import numpy as np
11
  import re
12
  import string
13
  import json
14
- # --- PPTX Imports ---
 
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
  import plotly.io as pio # Required for image export
20
- # ---------------------------
 
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
23
  from sklearn.decomposition import LatentDirichletAllocation
24
- # ------------------------------
 
25
  from gliner import GLiNER
26
  from streamlit_extras.stylable_container import stylable_container
27
 
28
-
29
-
30
-
31
-
32
-
33
-
34
-
35
-
36
  # Using a try/except for comet_ml import
37
  try:
38
  from comet_ml import Experiment
@@ -42,9 +36,11 @@ except ImportError:
42
  def log_parameter(self, *args): pass
43
  def log_table(self, *args): pass
44
  def end(self): pass
 
45
  # --- Model Home Directory (Fix for deployment environments) ---
46
  # Set HF_HOME environment variable to a writable path
47
  os.environ['HF_HOME'] = '/tmp'
 
48
  # --- Color Map for Highlighting and Network Graph Nodes ---
49
  entity_color_map = {
50
  "person": "#10b981",
@@ -56,23 +52,28 @@ entity_color_map = {
56
  "cardinal": "#06b6d4",
57
  "money": "#f43f5e",
58
  "position": "#a855f7",
59
- }
 
60
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
61
  labels = list(entity_color_map.keys())
62
  category_mapping = {
63
  "People": ["person", "organization", "position"],
64
  "Locations": ["country", "city"],
65
  "Time": ["date", "time"],
66
- "Numbers": ["money", "cardinal"]}
 
67
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
68
  # --- Utility Functions for Analysis and Plotly ---
69
  def extract_label(node_name):
70
  """Extracts the label from a node string like 'Text (Label)'."""
71
  match = re.search(r'\(([^)]+)\)$', node_name)
72
  return match.group(1) if match else "Unknown"
 
73
  def remove_trailing_punctuation(text_string):
74
  """Removes trailing punctuation from a string."""
75
  return text_string.rstrip(string.punctuation)
 
76
  def highlight_entities(text, df_entities):
77
  """Generates HTML to display text with entities highlighted and colored."""
78
  if df_entities.empty:
@@ -93,33 +94,31 @@ def highlight_entities(text, df_entities):
93
  # Use a div to mimic the Streamlit input box style for the report
94
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
95
 
96
-
97
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
98
  """
99
  Performs basic Topic Modeling using LDA on the extracted entities,
100
- allowing for n-grams to capture multi-word entities like 'Dr. Emily Carter'.
101
  """
102
- # 1. Prepare Documents: Use unique entities (they are short, clean documents)
103
  documents = df_entities['text'].unique().tolist()
104
-
105
  if len(documents) < 2:
106
  return None
107
-
108
  N = min(num_top_words, len(documents))
109
 
110
  try:
111
- # 2. Vectorizer: Use TfidfVectorizer, but allow unigrams, bigrams, and trigrams (ngram_range)
112
- # to capture multi-word entities. We keep stop_words='english' for the *components* of the entity.
113
  tfidf_vectorizer = TfidfVectorizer(
114
  max_df=0.95,
115
  min_df=2, # Only consider words/phrases that appear at least twice to find topics
116
  stop_words='english',
117
- ngram_range=(1, 3) # This is the KEY to capturing "Dr. Emily Carter" as a single token (if it appears enough times)
118
  )
119
 
120
  tfidf = tfidf_vectorizer.fit_transform(documents)
121
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
122
-
123
  # Check if the vocabulary is too small after tokenization/ngram generation
124
  if len(tfidf_feature_names) < num_topics:
125
  # Re-run with min_df=1 if vocab is too small
@@ -137,43 +136,35 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
137
  random_state=42, n_jobs=-1
138
  )
139
  lda.fit(tfidf)
140
-
141
  # 4. Extract Topic Data
142
  topic_data_list = []
143
  for topic_idx, topic in enumerate(lda.components_):
144
  top_words_indices = topic.argsort()[:-N - 1:-1]
145
- # These top_words will now include phrases like 'emily carter' or 'european space agency'
146
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
147
  word_weights = [topic[i] for i in top_words_indices]
148
-
149
  for word, weight in zip(top_words, word_weights):
150
  topic_data_list.append({
151
  'Topic_ID': f'Topic #{topic_idx + 1}',
152
  'Word': word,
153
  'Weight': weight,
154
  })
155
-
156
  return pd.DataFrame(topic_data_list)
157
-
158
  except Exception as e:
159
- # A broader catch for robustness
160
- # st.error(f"Topic modeling failed: {e}") # Keep commented out for cleaner app
161
  return None
162
-
163
-
164
-
165
-
166
 
167
  def create_topic_word_bubbles(df_topic_data):
168
  """Generates a Plotly Bubble Chart for top words across
169
  all topics, displaying the word directly on the bubble."""
170
  # Renaming columns to match the output of perform_topic_modeling
171
- df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic',
172
- 'Word': 'word', 'Weight': 'weight'})
173
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position
174
  if df_topic_data.empty:
175
  return None
176
-
177
  fig = px.scatter(
178
  df_topic_data,
179
  x='x_pos',
@@ -183,7 +174,7 @@ def create_topic_word_bubbles(df_topic_data):
183
  # Set text to the word
184
  text='word',
185
  hover_name='word',
186
- size_max=40,
187
  title='Topic Word Weights (Bubble Chart)',
188
  color_discrete_sequence=px.colors.qualitative.Bold,
189
  labels={
@@ -197,7 +188,7 @@ def create_topic_word_bubbles(df_topic_data):
197
  fig.update_layout(
198
  xaxis_title="Entity/Word",
199
  yaxis_title="Word Weight",
200
- # Hide x-axis labels since words are now labels
201
  xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
202
  yaxis={'showgrid': True},
203
  showlegend=True,
@@ -206,54 +197,56 @@ def create_topic_word_bubbles(df_topic_data):
206
  height=600,
207
  margin=dict(t=50, b=100, l=50, r=10),
208
  )
209
-
210
- # Update traces to show the word text, set the text position, and set text color
211
  fig.update_traces(
212
- # Position the text on top of the bubble
213
  textposition='middle center',
214
- # --- THE KEY FIX IS HERE ---
215
- # Set the text color to white for visibility against dark bubble colors
216
- textfont=dict(color='white', size=10),
217
- # ---------------------------
218
  hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
219
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
220
  )
221
-
222
  return fig
223
 
224
-
225
-
226
  def generate_network_graph(df, raw_text):
227
  """
228
  Generates a network graph visualization (Node Plot) with edges
229
- based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
230
  """
231
- # Using the existing generate_network_graph logic from previous context...
232
  entity_counts = df['text'].value_counts().reset_index()
233
  entity_counts.columns = ['text', 'frequency']
234
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
 
235
  if unique_entities.shape[0] < 2:
236
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
 
237
  num_nodes = len(unique_entities)
238
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
239
  radius = 10
240
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
241
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
242
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
 
243
  edges = set()
244
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
 
245
  for sentence in sentences:
246
  entities_in_sentence = []
247
  for entity_text in unique_entities['text'].unique():
248
  if entity_text.lower() in sentence.lower():
249
  entities_in_sentence.append(entity_text)
250
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
 
251
  for i in range(len(unique_entities_in_sentence)):
252
  for j in range(i + 1, len(unique_entities_in_sentence)):
253
  node1 = unique_entities_in_sentence[i]
254
  node2 = unique_entities_in_sentence[j]
255
  edge_tuple = tuple(sorted((node1, node2)))
256
  edges.add(edge_tuple)
 
257
  edge_x = []
258
  edge_y = []
259
  for edge in edges:
@@ -261,7 +254,10 @@ def generate_network_graph(df, raw_text):
261
  if n1 in pos_map and n2 in pos_map:
262
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
263
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
 
264
  fig = go.Figure()
 
 
265
  edge_trace = go.Scatter(
266
  x=edge_x, y=edge_y,
267
  line=dict(width=0.5, color='#888'),
@@ -271,6 +267,8 @@ def generate_network_graph(df, raw_text):
271
  showlegend=False
272
  )
273
  fig.add_trace(edge_trace)
 
 
274
  fig.add_trace(go.Scatter(
275
  x=unique_entities['x'],
276
  y=unique_entities['y'],
@@ -280,6 +278,7 @@ def generate_network_graph(df, raw_text):
280
  textposition="top center",
281
  showlegend=False,
282
  marker=dict(
 
283
  size=unique_entities['frequency'] * 5 + 10,
284
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
285
  line_width=1,
@@ -295,6 +294,8 @@ def generate_network_graph(df, raw_text):
295
  "Frequency: %{customdata[2]}<extra></extra>"
296
  )
297
  ))
 
 
298
  legend_traces = []
299
  seen_labels = set()
300
  for index, row in unique_entities.iterrows():
@@ -307,6 +308,7 @@ def generate_network_graph(df, raw_text):
307
  ))
308
  for trace in legend_traces:
309
  fig.add_trace(trace)
 
310
  fig.update_layout(
311
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
312
  showlegend=True,
@@ -319,7 +321,8 @@ def generate_network_graph(df, raw_text):
319
  height=600
320
  )
321
  return fig
322
- # --- NEW CSV GENERATION FUNCTION ---
 
323
  def generate_entity_csv(df):
324
  """
325
  Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -331,14 +334,16 @@ def generate_entity_csv(df):
331
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
332
  csv_buffer.seek(0)
333
  return csv_buffer
334
- # -----------------------------------
335
- # --- Existing App Functionality (HTML) ---
 
336
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
337
  """
338
  Generates a full HTML report containing all analysis results and visualizations.
339
- (Content omitted for brevity but assumed to be here).
340
  """
341
  # 1. Generate Visualizations (Plotly HTML)
 
342
  # 1a. Treemap
343
  fig_treemap = px.treemap(
344
  df,
@@ -350,81 +355,76 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
350
  )
351
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
352
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
 
353
  # 1b. Pie Chart
354
  grouped_counts = df['category'].value_counts().reset_index()
355
  grouped_counts.columns = ['Category', 'Count']
356
- # Changed color_discrete_sequence from sequential.RdBu (which has reds) to sequential.Cividis
357
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
358
  fig_pie.update_layout(margin=dict(t=50, b=10))
359
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
 
360
  # 1c. Bar Chart (Category Count)
361
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
362
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
363
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
 
364
  # 1d. Bar Chart (Most Frequent Entities)
365
  word_counts = df['text'].value_counts().reset_index()
366
  word_counts.columns = ['Entity', 'Count']
367
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
368
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
369
  if not repeating_entities.empty:
370
- # Changed color_discrete_sequence from sequential.Plasma (which has pink/magenta) to sequential.Viridis
371
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
372
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
373
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
 
374
  # 1e. Network Graph HTML
375
  network_fig = generate_network_graph(df, text_input)
376
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
 
377
  # 1f. Topic Charts HTML
378
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
379
  if df_topic_data is not None and not df_topic_data.empty:
380
  bubble_figure = create_topic_word_bubbles(df_topic_data)
381
  if bubble_figure:
382
-
383
  topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
384
  else:
385
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
386
  else:
387
- topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
388
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
389
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
390
  topic_charts_html += '</div>'
 
391
  # 2. Get Highlighted Text
392
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
 
393
  # 3. Entity Tables (Pandas to HTML)
394
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
395
  classes='table table-striped',
396
  index=False
397
  )
398
- # 4. Construct the Final HTML
 
399
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
400
  <meta charset="UTF-8">
401
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
402
  <title>Entity and Topic Analysis Report</title>
403
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
404
  <style>
405
- body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px;
406
- background-color: #f4f4f9; color: #333; }}
407
- .container {{ max-width: 1200px; margin: 0 auto; background-color:
408
- #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px
409
- rgba(0,0,0,0.1); }}
410
- h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom:
411
- 10px; margin-top: 0; }}
412
- h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd;
413
- padding-bottom: 5px; }}
414
  h3 {{ color: #555; margin-top: 20px; }}
415
- .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius:
416
- 8px; margin-bottom: 20px; font-size: 0.9em; }}
417
- .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius:
418
- 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px;
419
- }}
420
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
421
- table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align:
422
- left; }}
423
  table th {{ background-color: #f0f0f0; }}
424
- .highlighted-text {{ border: 1px solid #888888; padding: 15px;
425
- border-radius: 5px; background-color: #ffffff; font-family: monospace;
426
- white-space: pre-wrap; margin-bottom: 20px; }}
427
-
428
  /* === MOBILE-SPECIFIC FIXES FOR REPORT OVERLAP === */
429
  @media (max-width: 600px) {
430
  body {
@@ -478,15 +478,10 @@ white-space: pre-wrap; margin-bottom: 20px; }}
478
  </div></body></html>
479
  """
480
  return html_content
481
-
482
-
483
-
484
-
485
 
486
  # --- Page Configuration and Styling (No Sidebar) ---
487
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
488
 
489
-
490
  # --- Conditional Mobile Warning ---
491
  st.markdown(
492
  """
@@ -522,50 +517,35 @@ st.markdown(
522
  )
523
  # ----------------------------------
524
 
525
-
526
-
527
-
528
-
529
-
530
-
531
-
532
-
533
  st.markdown(
534
  """
535
  <style>
536
- /* ... (Keep your existing styles for main, stApp, stTextArea, stButton) ... */
537
  /* --- FIX: Tab Label Colors for Visibility --- */
538
- /* Target the container for the tab labels (the buttons) */
539
  [data-testid="stConfigurableTabs"] button {
540
- color: #333333 !important; /* Dark gray for inactive tabs */
541
- background-color: #f0f0f0; /* Light gray background for inactive tabs */
542
  border: 1px solid #cccccc;
543
  }
544
  /* Target the ACTIVE tab label */
545
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
546
- color: #FFFFFF !important; /* White text for active tab */
547
- background-color: #007bff; /* Blue background for active tab */
548
- border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
549
  }
550
-
551
- /* Expander header color fix (since you overwrote it to white) */
552
  .streamlit-expanderHeader {
553
- color: #007bff; /* Blue text for Expander header */
554
  }
555
  </style>
556
  """,
557
  unsafe_allow_html=True
558
  )
559
 
560
-
561
- st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
562
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
563
 
564
-
565
-
566
-
567
-
568
- tab1, tab2 = st.tabs(["Embed", "Important Notes"]) # Assuming you have defined the tabs
569
 
570
  with tab1:
571
  with st.expander("Embed"):
@@ -578,32 +558,25 @@ with tab1:
578
  height="450"
579
  ></iframe>
580
  '''
581
- st.code(code, language="html") # Keeps the copy icon, as intended for tab1
582
-
583
-
584
 
585
  with tab2:
586
  expander = st.expander("**Important Notes**")
587
- # Use st.markdown() with a code block (```) to display the notes
588
- # without the copy-to-clipboard icon, and retaining the styling.
589
  expander.markdown("""
590
  **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
591
-
592
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
593
-
594
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
595
-
596
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
597
  """)
598
 
599
-
600
- st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
601
 
602
  # --- Comet ML Setup (Placeholder/Conditional) ---
603
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
604
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
605
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
606
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 
607
  # --- Model Loading ---
608
  @st.cache_resource
609
  def load_ner_model():
@@ -613,9 +586,10 @@ def load_ner_model():
613
  except Exception as e:
614
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
615
  st.stop()
 
616
  model = load_ner_model()
617
- # --- LONG DEFAULT TEXT (178 Words) ---
618
 
 
619
  DEFAULT_TEXT = (
620
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
621
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
@@ -632,16 +606,9 @@ DEFAULT_TEXT = (
632
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
633
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
634
  )
635
-
636
-
637
-
638
-
639
-
640
-
641
-
642
-
643
  # -----------------------------------
644
- # --- Session State Initialization (CRITICAL FIX) ---
 
645
  if 'show_results' not in st.session_state:
646
  st.session_state.show_results = False
647
  if 'last_text' not in st.session_state:
@@ -652,9 +619,11 @@ if 'elapsed_time' not in st.session_state:
652
  st.session_state.elapsed_time = 0.0
653
  if 'topic_results' not in st.session_state:
654
  st.session_state.topic_results = None
 
655
  if 'my_text_area' not in st.session_state:
656
  st.session_state.my_text_area = DEFAULT_TEXT
657
- # --- Clear Button Function (MODIFIED) ---
 
658
  def clear_text():
659
  """Clears the text area (sets it to an empty string) and hides results."""
660
  st.session_state['my_text_area'] = ""
@@ -663,16 +632,19 @@ def clear_text():
663
  st.session_state.results_df = pd.DataFrame()
664
  st.session_state.elapsed_time = 0.0
665
  st.session_state.topic_results = None
 
666
  # --- Text Input and Clear Button ---
667
  word_limit = 1000
668
  text = st.text_area(
669
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
670
  height=250,
671
- key='my_text_area',
672
  )
 
673
  word_count = len(text.split())
674
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
675
  st.button("Clear text", on_click=clear_text)
 
676
  # --- Results Trigger and Processing (Updated Logic) ---
677
  if st.button("Results"):
678
  if not text.strip():
@@ -686,20 +658,25 @@ if st.button("Results"):
686
  if text != st.session_state.last_text:
687
  st.session_state.last_text = text
688
  start_time = time.time()
 
689
  # --- Model Prediction & Dataframe Creation ---
690
  entities = model.predict_entities(text, labels)
691
  df = pd.DataFrame(entities)
 
692
  if not df.empty:
693
  df['text'] = df['text'].apply(remove_trailing_punctuation)
694
  df['category'] = df['label'].map(reverse_category_mapping)
695
  st.session_state.results_df = df
 
696
  unique_entity_count = len(df['text'].unique())
697
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
 
698
  st.session_state.topic_results = perform_topic_modeling(
699
  df,
700
  num_topics=2,
701
  num_top_words=N_TOP_WORDS_TO_USE
702
  )
 
703
  if comet_initialized:
704
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
705
  experiment.log_parameter("input_text", text)
@@ -708,32 +685,37 @@ if st.button("Results"):
708
  else:
709
  st.session_state.results_df = pd.DataFrame()
710
  st.session_state.topic_results = None
 
711
  end_time = time.time()
712
  st.session_state.elapsed_time = end_time - start_time
713
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
 
714
  st.session_state.show_results = True
715
- # --- Display Download Link and Results ---
 
716
  if st.session_state.show_results:
717
  df = st.session_state.results_df
718
  df_topic_data = st.session_state.topic_results
 
719
  if df.empty:
720
  st.warning("No entities were found in the provided text.")
721
  else:
722
  st.subheader("Analysis Results", divider="blue")
 
723
  # 1. Highlighted Text
724
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
725
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
726
-
727
  # 2. Detailed Entity Analysis Tabs
728
  st.markdown("### 2. Detailed Entity Analysis")
729
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
 
730
  with tab_category_details:
731
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
732
-
733
-
734
-
735
  unique_categories = list(category_mapping.keys())
736
  tabs_category = st.tabs(unique_categories)
 
737
  for category, tab in zip(unique_categories, tabs_category):
738
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
739
  with tab:
@@ -744,91 +726,45 @@ if st.session_state.show_results:
744
  use_container_width=True,
745
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
746
  )
747
- else:
748
- st.info(f"No entities of category **{category}** were found in the text.")
749
-
750
-
751
- with st.expander("See Glossary of tags"):
752
- st.write('''
753
- - **text**: ['entity extracted from your text data']
754
- - **label**: ['label (tag) assigned to a given extracted entity']
755
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
756
- - **start**: ['index of the start of the corresponding entity']
757
- - **end**: ['index of the end of the corresponding entity']
758
- ''')
759
-
760
  with tab_treemap_viz:
761
- st.markdown("#### Treemap: Entity Distribution")
762
  fig_treemap = px.treemap(
763
  df,
764
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
765
  values='score',
766
  color='category',
 
767
  color_discrete_sequence=px.colors.qualitative.Dark24
768
  )
769
- fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
770
  st.plotly_chart(fig_treemap, use_container_width=True)
771
- # 3. Comparative Charts
772
- st.markdown("---")
773
- st.markdown("### 3. Comparative Charts")
774
- col1, col2, col3 = st.columns(3)
775
- grouped_counts = df['category'].value_counts().reset_index()
776
- grouped_counts.columns = ['Category', 'Count']
777
- with col1: # Pie Chart
778
- # Changed color_discrete_sequence
779
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
780
- fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
781
- st.plotly_chart(fig_pie, use_container_width=True)
782
- with col2: # Bar Chart (Category Count)
783
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
784
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
785
- st.plotly_chart(fig_bar_category, use_container_width=True)
786
- with col3: # Bar Chart (Most Frequent Entities)
787
- word_counts = df['text'].value_counts().reset_index()
788
- word_counts.columns = ['Entity', 'Count']
789
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
790
- if not repeating_entities.empty:
791
- # Changed color_discrete_sequence
792
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
793
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
794
- st.plotly_chart(fig_bar_freq, use_container_width=True)
795
- else:
796
- st.info("No entities repeat for frequency chart.")
797
- st.markdown("---")
798
- st.markdown("### 4. Entity Relationship Map")
799
- network_fig = generate_network_graph(df, st.session_state.last_text)
800
- st.plotly_chart(network_fig, use_container_width=True)
801
- st.markdown("---")
802
- st.markdown("### 5. Topic Modelling Analysis")
803
- if df_topic_data is not None and not df_topic_data.empty:
804
- bubble_figure = create_topic_word_bubbles(df_topic_data)
805
- if bubble_figure:
806
- st.plotly_chart(bubble_figure, use_container_width=True)
807
- else:
808
- st.error("Error generating Topic Word Bubble Chart.")
809
- else:
810
- st.info("Topic modeling requires more unique input (at least two unique entities).")
811
- # --- Report Download ---
812
- st.markdown("---")
813
- st.markdown("### Download Full Report Artifacts")
814
- # 1. HTML Report Download (Retained)
815
- html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
816
- st.download_button(
817
- label="Download Comprehensive HTML Report",
818
- data=html_report,
819
- file_name="ner_topic_report.html",
820
- mime="text/html",
821
- type="primary"
822
- )
823
 
824
- # 2. CSV Data Download (NEW)
825
- csv_buffer = generate_entity_csv(df)
826
- st.download_button(
827
- label="Download Extracted Entities (CSV)",
828
- data=csv_buffer,
829
- file_name="extracted_entities.csv",
830
- mime="text/csv",
831
- type="secondary"
832
- )
833
 
 
 
 
834
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import time
3
  import streamlit as st
4
  import streamlit.components.v1 as components
 
10
  import re
11
  import string
12
  import json
13
+
14
+ # --- PPTX Imports (Kept for completeness) ---
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
  import plotly.io as pio # Required for image export
20
+ # -------------------------------------------
21
+
22
  # --- Stable Scikit-learn LDA Imports ---
23
  from sklearn.feature_extraction.text import TfidfVectorizer
24
  from sklearn.decomposition import LatentDirichletAllocation
25
+ # ---------------------------------------
26
+
27
  from gliner import GLiNER
28
  from streamlit_extras.stylable_container import stylable_container
29
 
 
 
 
 
 
 
 
 
30
  # Using a try/except for comet_ml import
31
  try:
32
  from comet_ml import Experiment
 
36
  def log_parameter(self, *args): pass
37
  def log_table(self, *args): pass
38
  def end(self): pass
39
+
40
  # --- Model Home Directory (Fix for deployment environments) ---
41
  # Set HF_HOME environment variable to a writable path
42
  os.environ['HF_HOME'] = '/tmp'
43
+
44
  # --- Color Map for Highlighting and Network Graph Nodes ---
45
  entity_color_map = {
46
  "person": "#10b981",
 
52
  "cardinal": "#06b6d4",
53
  "money": "#f43f5e",
54
  "position": "#a855f7",
55
+ }
56
+
57
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
58
  labels = list(entity_color_map.keys())
59
  category_mapping = {
60
  "People": ["person", "organization", "position"],
61
  "Locations": ["country", "city"],
62
  "Time": ["date", "time"],
63
+ "Numbers": ["money", "cardinal"]
64
+ }
65
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
66
+
67
  # --- Utility Functions for Analysis and Plotly ---
68
  def extract_label(node_name):
69
  """Extracts the label from a node string like 'Text (Label)'."""
70
  match = re.search(r'\(([^)]+)\)$', node_name)
71
  return match.group(1) if match else "Unknown"
72
+
73
  def remove_trailing_punctuation(text_string):
74
  """Removes trailing punctuation from a string."""
75
  return text_string.rstrip(string.punctuation)
76
+
77
  def highlight_entities(text, df_entities):
78
  """Generates HTML to display text with entities highlighted and colored."""
79
  if df_entities.empty:
 
94
  # Use a div to mimic the Streamlit input box style for the report
95
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
96
 
 
97
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
98
  """
99
  Performs basic Topic Modeling using LDA on the extracted entities,
100
+ allowing for n-grams (up to 3 words) to capture multi-word entities.
101
  """
102
+ # 1. Prepare Documents: Use unique entities
103
  documents = df_entities['text'].unique().tolist()
104
+
105
  if len(documents) < 2:
106
  return None
107
+
108
  N = min(num_top_words, len(documents))
109
 
110
  try:
111
+ # 2. Vectorizer: Use TfidfVectorizer with ngram_range to capture multi-word entities.
 
112
  tfidf_vectorizer = TfidfVectorizer(
113
  max_df=0.95,
114
  min_df=2, # Only consider words/phrases that appear at least twice to find topics
115
  stop_words='english',
116
+ ngram_range=(1, 3)
117
  )
118
 
119
  tfidf = tfidf_vectorizer.fit_transform(documents)
120
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
121
+
122
  # Check if the vocabulary is too small after tokenization/ngram generation
123
  if len(tfidf_feature_names) < num_topics:
124
  # Re-run with min_df=1 if vocab is too small
 
136
  random_state=42, n_jobs=-1
137
  )
138
  lda.fit(tfidf)
139
+
140
  # 4. Extract Topic Data
141
  topic_data_list = []
142
  for topic_idx, topic in enumerate(lda.components_):
143
  top_words_indices = topic.argsort()[:-N - 1:-1]
 
144
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
145
  word_weights = [topic[i] for i in top_words_indices]
146
+
147
  for word, weight in zip(top_words, word_weights):
148
  topic_data_list.append({
149
  'Topic_ID': f'Topic #{topic_idx + 1}',
150
  'Word': word,
151
  'Weight': weight,
152
  })
153
+
154
  return pd.DataFrame(topic_data_list)
155
+
156
  except Exception as e:
 
 
157
  return None
 
 
 
 
158
 
159
  def create_topic_word_bubbles(df_topic_data):
160
  """Generates a Plotly Bubble Chart for top words across
161
  all topics, displaying the word directly on the bubble."""
162
  # Renaming columns to match the output of perform_topic_modeling
163
+ df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
 
164
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position
165
  if df_topic_data.empty:
166
  return None
167
+
168
  fig = px.scatter(
169
  df_topic_data,
170
  x='x_pos',
 
174
  # Set text to the word
175
  text='word',
176
  hover_name='word',
177
+ size_max=40, # Reduced size_max for smaller bubbles
178
  title='Topic Word Weights (Bubble Chart)',
179
  color_discrete_sequence=px.colors.qualitative.Bold,
180
  labels={
 
188
  fig.update_layout(
189
  xaxis_title="Entity/Word",
190
  yaxis_title="Word Weight",
191
+ # Hides the vertical X-axis line, tick labels, and grid
192
  xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
193
  yaxis={'showgrid': True},
194
  showlegend=True,
 
197
  height=600,
198
  margin=dict(t=50, b=100, l=50, r=10),
199
  )
200
+
201
+ # Update traces to set text color to white
202
  fig.update_traces(
 
203
  textposition='middle center',
204
+ textfont=dict(color='white', size=10), # Fix for text visibility
 
 
 
205
  hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
206
  marker=dict(line=dict(width=1, color='DarkSlateGrey'))
207
  )
208
+
209
  return fig
210
 
 
 
211
  def generate_network_graph(df, raw_text):
212
  """
213
  Generates a network graph visualization (Node Plot) with edges
214
+ based on entity co-occurrence in sentences.
215
  """
 
216
  entity_counts = df['text'].value_counts().reset_index()
217
  entity_counts.columns = ['text', 'frequency']
218
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
219
+
220
  if unique_entities.shape[0] < 2:
221
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
222
+
223
+ # Positioning logic (simplified circular layout with slight jitter)
224
  num_nodes = len(unique_entities)
225
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
226
  radius = 10
227
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
228
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
229
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
230
+
231
+ # Co-occurrence Edges based on sentences
232
  edges = set()
233
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
234
+
235
  for sentence in sentences:
236
  entities_in_sentence = []
237
  for entity_text in unique_entities['text'].unique():
238
  if entity_text.lower() in sentence.lower():
239
  entities_in_sentence.append(entity_text)
240
  unique_entities_in_sentence = list(set(entities_in_sentence))
241
+
242
+ # Create edges for all pairs in the sentence
243
  for i in range(len(unique_entities_in_sentence)):
244
  for j in range(i + 1, len(unique_entities_in_sentence)):
245
  node1 = unique_entities_in_sentence[i]
246
  node2 = unique_entities_in_sentence[j]
247
  edge_tuple = tuple(sorted((node1, node2)))
248
  edges.add(edge_tuple)
249
+
250
  edge_x = []
251
  edge_y = []
252
  for edge in edges:
 
254
  if n1 in pos_map and n2 in pos_map:
255
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
256
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
257
+
258
  fig = go.Figure()
259
+
260
+ # Edge Trace
261
  edge_trace = go.Scatter(
262
  x=edge_x, y=edge_y,
263
  line=dict(width=0.5, color='#888'),
 
267
  showlegend=False
268
  )
269
  fig.add_trace(edge_trace)
270
+
271
+ # Node Trace
272
  fig.add_trace(go.Scatter(
273
  x=unique_entities['x'],
274
  y=unique_entities['y'],
 
278
  textposition="top center",
279
  showlegend=False,
280
  marker=dict(
281
+ # Size nodes based on frequency
282
  size=unique_entities['frequency'] * 5 + 10,
283
  color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
284
  line_width=1,
 
294
  "Frequency: %{customdata[2]}<extra></extra>"
295
  )
296
  ))
297
+
298
+ # Custom Legend for Node Colors
299
  legend_traces = []
300
  seen_labels = set()
301
  for index, row in unique_entities.iterrows():
 
308
  ))
309
  for trace in legend_traces:
310
  fig.add_trace(trace)
311
+
312
  fig.update_layout(
313
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
314
  showlegend=True,
 
321
  height=600
322
  )
323
  return fig
324
+
325
+ # --- CSV GENERATION FUNCTION ---
326
  def generate_entity_csv(df):
327
  """
328
  Generates a CSV file of the extracted entities in an in-memory buffer,
 
334
  csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
335
  csv_buffer.seek(0)
336
  return csv_buffer
337
+ # -----------------------------
338
+
339
+ # --- HTML REPORT GENERATION FUNCTION ---
340
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
341
  """
342
  Generates a full HTML report containing all analysis results and visualizations.
343
+ Includes mobile-specific CSS fixes.
344
  """
345
  # 1. Generate Visualizations (Plotly HTML)
346
+
347
  # 1a. Treemap
348
  fig_treemap = px.treemap(
349
  df,
 
355
  )
356
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
357
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
358
+
359
  # 1b. Pie Chart
360
  grouped_counts = df['category'].value_counts().reset_index()
361
  grouped_counts.columns = ['Category', 'Count']
 
362
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
363
  fig_pie.update_layout(margin=dict(t=50, b=10))
364
  pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
365
+
366
  # 1c. Bar Chart (Category Count)
367
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
368
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
369
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
370
+
371
  # 1d. Bar Chart (Most Frequent Entities)
372
  word_counts = df['text'].value_counts().reset_index()
373
  word_counts.columns = ['Entity', 'Count']
374
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
375
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
376
  if not repeating_entities.empty:
 
377
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
378
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
379
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
380
+
381
  # 1e. Network Graph HTML
382
  network_fig = generate_network_graph(df, text_input)
383
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
384
+
385
  # 1f. Topic Charts HTML
386
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
387
  if df_topic_data is not None and not df_topic_data.empty:
388
  bubble_figure = create_topic_word_bubbles(df_topic_data)
389
  if bubble_figure:
390
+ # Added config={'responsive': True} for HTML report resizing
391
  topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
392
  else:
393
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
394
  else:
395
+ topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
396
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
397
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
398
  topic_charts_html += '</div>'
399
+
400
  # 2. Get Highlighted Text
401
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
402
+
403
  # 3. Entity Tables (Pandas to HTML)
404
  entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
405
  classes='table table-striped',
406
  index=False
407
  )
408
+
409
+ # 4. Construct the Final HTML with Corrected Mobile CSS
410
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
411
  <meta charset="UTF-8">
412
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
413
  <title>Entity and Topic Analysis Report</title>
414
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
415
  <style>
416
+ body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px;background-color: #f4f4f9; color: #333; }}
417
+ .container {{ max-width: 1200px; margin: 0 auto; background-color:#ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12pxrgba(0,0,0,0.1); }}
418
+ h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom:10px; margin-top: 0; }}
419
+ h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd;padding-bottom: 5px; }}
 
 
 
 
 
420
  h3 {{ color: #555; margin-top: 20px; }}
421
+ .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius:8px; margin-bottom: 20px; font-size: 0.9em; }}
422
+ .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius:8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px;}}
 
 
 
423
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
424
+ table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align:left; }}
 
425
  table th {{ background-color: #f0f0f0; }}
426
+ .highlighted-text {{ border: 1px solid #888888; padding: 15px;border-radius: 5px; background-color: #ffffff; font-family: monospace;white-space: pre-wrap; margin-bottom: 20px; }}
427
+
 
 
428
  /* === MOBILE-SPECIFIC FIXES FOR REPORT OVERLAP === */
429
  @media (max-width: 600px) {
430
  body {
 
478
  </div></body></html>
479
  """
480
  return html_content
 
 
 
 
481
 
482
  # --- Page Configuration and Styling (No Sidebar) ---
483
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
484
 
 
485
  # --- Conditional Mobile Warning ---
486
  st.markdown(
487
  """
 
517
  )
518
  # ----------------------------------
519
 
520
+ # --- General Streamlit Style Fixes ---
 
 
 
 
 
 
 
521
  st.markdown(
522
  """
523
  <style>
 
524
  /* --- FIX: Tab Label Colors for Visibility --- */
 
525
  [data-testid="stConfigurableTabs"] button {
526
+ color: #333333 !important;
527
+ background-color: #f0f0f0;
528
  border: 1px solid #cccccc;
529
  }
530
  /* Target the ACTIVE tab label */
531
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
532
+ color: #FFFFFF !important;
533
+ background-color: #007bff;
534
+ border-bottom: 2px solid #007bff;
535
  }
536
+ /* Expander header color fix */
 
537
  .streamlit-expanderHeader {
538
+ color: #007bff;
539
  }
540
  </style>
541
  """,
542
  unsafe_allow_html=True
543
  )
544
 
545
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
 
546
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
547
 
548
+ tab1, tab2 = st.tabs(["Embed", "Important Notes"])
 
 
 
 
549
 
550
  with tab1:
551
  with st.expander("Embed"):
 
558
  height="450"
559
  ></iframe>
560
  '''
561
+ st.code(code, language="html")
 
 
562
 
563
  with tab2:
564
  expander = st.expander("**Important Notes**")
 
 
565
  expander.markdown("""
566
  **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
 
567
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
 
568
  **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
 
569
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
570
  """)
571
 
572
+ st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
573
 
574
  # --- Comet ML Setup (Placeholder/Conditional) ---
575
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
576
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
577
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
578
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
579
+
580
  # --- Model Loading ---
581
  @st.cache_resource
582
  def load_ner_model():
 
586
  except Exception as e:
587
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
588
  st.stop()
589
+
590
  model = load_ner_model()
 
591
 
592
+ # --- LONG DEFAULT TEXT (178 Words) ---
593
  DEFAULT_TEXT = (
594
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
595
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
 
606
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
607
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
608
  )
 
 
 
 
 
 
 
 
609
  # -----------------------------------
610
+
611
+ # --- Session State Initialization (CRITICAL) ---
612
  if 'show_results' not in st.session_state:
613
  st.session_state.show_results = False
614
  if 'last_text' not in st.session_state:
 
619
  st.session_state.elapsed_time = 0.0
620
  if 'topic_results' not in st.session_state:
621
  st.session_state.topic_results = None
622
+ # --- FIX: Only set default text in session state, not in st.text_area value ---
623
  if 'my_text_area' not in st.session_state:
624
  st.session_state.my_text_area = DEFAULT_TEXT
625
+
626
+ # --- Clear Button Function ---
627
  def clear_text():
628
  """Clears the text area (sets it to an empty string) and hides results."""
629
  st.session_state['my_text_area'] = ""
 
632
  st.session_state.results_df = pd.DataFrame()
633
  st.session_state.elapsed_time = 0.0
634
  st.session_state.topic_results = None
635
+
636
  # --- Text Input and Clear Button ---
637
  word_limit = 1000
638
  text = st.text_area(
639
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
640
  height=250,
641
+ key='my_text_area', # Streamlit automatically uses st.session_state.my_text_area here
642
  )
643
+
644
  word_count = len(text.split())
645
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
646
  st.button("Clear text", on_click=clear_text)
647
+
648
  # --- Results Trigger and Processing (Updated Logic) ---
649
  if st.button("Results"):
650
  if not text.strip():
 
658
  if text != st.session_state.last_text:
659
  st.session_state.last_text = text
660
  start_time = time.time()
661
+
662
  # --- Model Prediction & Dataframe Creation ---
663
  entities = model.predict_entities(text, labels)
664
  df = pd.DataFrame(entities)
665
+
666
  if not df.empty:
667
  df['text'] = df['text'].apply(remove_trailing_punctuation)
668
  df['category'] = df['label'].map(reverse_category_mapping)
669
  st.session_state.results_df = df
670
+
671
  unique_entity_count = len(df['text'].unique())
672
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
673
+
674
  st.session_state.topic_results = perform_topic_modeling(
675
  df,
676
  num_topics=2,
677
  num_top_words=N_TOP_WORDS_TO_USE
678
  )
679
+
680
  if comet_initialized:
681
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
682
  experiment.log_parameter("input_text", text)
 
685
  else:
686
  st.session_state.results_df = pd.DataFrame()
687
  st.session_state.topic_results = None
688
+
689
  end_time = time.time()
690
  st.session_state.elapsed_time = end_time - start_time
691
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
692
+
693
  st.session_state.show_results = True
694
+
695
+ # --- Display Download Link and Results (Updated with Download Buttons) ---
696
  if st.session_state.show_results:
697
  df = st.session_state.results_df
698
  df_topic_data = st.session_state.topic_results
699
+
700
  if df.empty:
701
  st.warning("No entities were found in the provided text.")
702
  else:
703
  st.subheader("Analysis Results", divider="blue")
704
+
705
  # 1. Highlighted Text
706
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
707
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
708
+
709
  # 2. Detailed Entity Analysis Tabs
710
  st.markdown("### 2. Detailed Entity Analysis")
711
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
712
+
713
  with tab_category_details:
714
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
715
+
 
 
716
  unique_categories = list(category_mapping.keys())
717
  tabs_category = st.tabs(unique_categories)
718
+
719
  for category, tab in zip(unique_categories, tabs_category):
720
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
721
  with tab:
 
726
  use_container_width=True,
727
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
728
  )
729
+
 
 
 
 
 
 
 
 
 
 
 
 
730
  with tab_treemap_viz:
 
731
  fig_treemap = px.treemap(
732
  df,
733
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
734
  values='score',
735
  color='category',
736
+ title="Entity Distribution by Category and Label",
737
  color_discrete_sequence=px.colors.qualitative.Dark24
738
  )
739
+ fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
740
  st.plotly_chart(fig_treemap, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
 
 
 
 
 
 
 
 
 
 
742
 
743
+ # 3. Download Options (NEW)
744
+ st.markdown("### 3. Download Options")
745
+ col_csv, col_html = st.columns(2)
746
 
747
+ with col_csv:
748
+ csv_data = generate_entity_csv(df)
749
+ st.download_button(
750
+ label="Download Entities as CSV",
751
+ data=csv_data,
752
+ file_name="entity_analysis_data.csv",
753
+ mime="text/csv",
754
+ type="primary"
755
+ )
756
+
757
+ with col_html:
758
+ html_report = generate_html_report(
759
+ df,
760
+ st.session_state.last_text,
761
+ st.session_state.elapsed_time,
762
+ df_topic_data
763
+ )
764
+ st.download_button(
765
+ label="Download Full HTML Report",
766
+ data=html_report,
767
+ file_name="entity_topic_report.html",
768
+ mime="text/html",
769
+ type="secondary"
770
+ )