Update src/streamlit_app.py

#1
by nlpblogs - opened
Files changed (1) hide show
  1. src/streamlit_app.py +164 -751
src/streamlit_app.py CHANGED
@@ -1,815 +1,228 @@
1
  import os
2
- os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
- import streamlit.components.v1 as components
6
  import pandas as pd
7
- import io
8
- import plotly.express as px
9
- import plotly.graph_objects as go
10
  import numpy as np
11
  import re
12
  import string
13
  import json
14
- # --- PPTX Imports ---
15
  from io import BytesIO
 
 
 
 
 
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
- from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
- import plotly.io as pio # Required for image export
20
- # ---------------------------
21
- # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
23
  from sklearn.decomposition import LatentDirichletAllocation
24
- # ------------------------------
25
- from gliner import GLiNER
26
- from streamlit_extras.stylable_container import stylable_container
27
-
28
-
29
-
30
-
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
-
47
-
48
-
49
-
50
 
51
-
52
-
53
-
54
-
55
-
56
-
57
-
58
-
59
-
60
-
61
# comet_ml is an optional dependency: when it is not installed, substitute a
# no-op Experiment class so every logging call later in the script is harmless.
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in for comet_ml.Experiment when comet_ml is absent."""
        def __init__(self, **kwargs): pass
        def log_parameter(self, *args): pass
        def log_table(self, *args): pass
        def end(self): pass
70
# --- Model Home Directory (Fix for deployment environments) ---
# Point the Hugging Face cache at /tmp, which is writable in containerized
# deployments (e.g. HF Spaces), so model downloads do not fail on a
# read-only filesystem.
os.environ['HF_HOME'] = '/tmp'

# --- Color Map for Highlighting and Network Graph Nodes ---
# Hex color used both for the inline text highlighting and for node colors
# in the co-occurrence network graph.
entity_color_map = {
    "person": "#10b981",
    "country": "#3b82f6",
    "city": "#4ade80",
    "organization": "#f59e0b",
    "date": "#8b5cf6",
    "time": "#ec4899",
    "cardinal": "#06b6d4",
    "money": "#f43f5e",
    "position": "#a855f7",
}

# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
# The label set passed to the NER model is derived from the color map keys,
# so every predicted label is guaranteed to have a display color.
labels = list(entity_color_map.keys())

# Groups the fine-grained labels into the four broad report categories.
category_mapping = {
    "People": ["person", "organization", "position"],
    "Locations": ["country", "city"],
    "Time": ["date", "time"],
    "Numbers": ["money", "cardinal"]}

# Inverse lookup (label -> category) used to annotate the entity DataFrame.
reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
93
- # --- Utility Functions for Analysis and Plotly ---
94
def extract_label(node_name):
    """Return the parenthesized label at the end of a node string.

    For a node formatted as ``'Text (Label)'`` this returns ``'Label'``;
    when no trailing ``(...)`` group exists, ``'Unknown'`` is returned.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found is None:
        return "Unknown"
    return found.group(1)
98
  def remove_trailing_punctuation(text_string):
99
- """Removes trailing punctuation from a string."""
100
  return text_string.rstrip(string.punctuation)
 
101
def highlight_entities(text, df_entities):
    """Render *text* as HTML with every detected entity wrapped in a colored span.

    Entities are spliced in from the highest start offset downward so the
    earlier (smaller) character indices remain valid while HTML is inserted.
    Returns the raw text unchanged when no entities were found.
    """
    if df_entities.empty:
        return text
    result = text
    # Descending 'start' order: insertions never shift offsets we still need.
    for entity in df_entities.sort_values(by='start', ascending=False).to_dict('records'):
        color = entity_color_map.get(entity['label'], '#000000')
        span = (
            f'<span style="background-color: {color}; color: white; padding: 2px 4px; '
            f'border-radius: 3px; cursor: help;" title="{entity["label"]}">{entity["text"]}</span>'
        )
        result = result[:entity['start']] + span + result[entity['end']:]
    # Wrap in a div styled to resemble the Streamlit input box in the report.
    return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{result}</div>'
120
-
121
 
122
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """Run LDA topic modeling over the unique extracted entity strings.

    Each unique entity text is treated as a tiny document. N-grams up to
    trigrams are allowed so multi-word entities ("Dr. Emily Carter") can
    surface as single topic terms.

    Returns a DataFrame with columns Topic_ID / Word / Weight, or None when
    there is too little data or vectorization/fitting fails.
    """
    docs = df_entities['text'].unique().tolist()
    if len(docs) < 2:
        return None

    n_terms = min(num_top_words, len(docs))

    try:
        # First pass: only keep terms appearing in at least two documents.
        vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=2,
            stop_words='english',
            ngram_range=(1, 3),
        )
        matrix = vectorizer.fit_transform(docs)
        feature_names = vectorizer.get_feature_names_out()

        # Fallback: relax min_df when the vocabulary is smaller than the
        # requested topic count; give up if it is still too small.
        if len(feature_names) < num_topics:
            vectorizer = TfidfVectorizer(
                max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3)
            )
            matrix = vectorizer.fit_transform(docs)
            feature_names = vectorizer.get_feature_names_out()
            if len(feature_names) < num_topics:
                return None

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
        lda.fit(matrix)

        # Collect the top-weighted terms of every topic into flat rows.
        rows = []
        for topic_idx, topic in enumerate(lda.components_):
            ranked_indices = topic.argsort()[:-n_terms - 1:-1]
            for term_idx in ranked_indices:
                rows.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': feature_names[term_idx],
                    'Weight': topic[term_idx],
                })
        return pd.DataFrame(rows)

    except Exception:
        # Topic modeling is best-effort: any failure simply disables the
        # topic charts instead of crashing the app.
        return None
187
-
188
 
 
189
 
190
-
191
-
192
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of topic word weights.

    Bubble size and y-position both encode the word weight; the word itself
    is drawn on the bubble. Returns None when there is no topic data.
    """
    # Normalize the column names produced by perform_topic_modeling.
    data = df_topic_data.rename(columns={'Topic_ID': 'topic',
                                         'Word': 'word', 'Weight': 'weight'})
    data['x_pos'] = data.index  # stable per-word x position
    if data.empty:
        return None

    fig = px.scatter(
        data,
        x='x_pos',
        y='weight',
        size='weight',
        color='topic',
        text='word',
        hover_name='word',
        size_max=40,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        labels={
            'x_pos': 'Entity/Word Index',
            'weight': 'Word Weight',
            'topic': 'Topic ID'
        },
        custom_data=['word', 'weight', 'topic']
    )

    # The words themselves label the bubbles, so numeric x ticks are hidden.
    fig.update_layout(
        xaxis_title="Entity/Word",
        yaxis_title="Word Weight",
        xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
        yaxis={'showgrid': True},
        showlegend=True,
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )

    # White centered text keeps labels legible on dark bubble colors.
    fig.update_traces(
        textposition='middle center',
        textfont=dict(color='white', size=10),
        hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
        marker=dict(line=dict(width=1, color='DarkSlateGrey'))
    )

    return fig
248
 
249
-
250
-
251
def generate_network_graph(df, raw_text):
    """Build a Plotly network figure of entities.

    Nodes are unique (text, label) entities placed on a jittered circle; an
    edge connects two entities whenever they co-occur in the same sentence
    of *raw_text*. Node size scales with entity frequency and node color
    follows entity_color_map.
    """
    # Frequency of each entity surface form across all predictions.
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
    if unique_entities.shape[0] < 2:
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
    num_nodes = len(unique_entities)
    # Circular layout with Gaussian jitter so nearby labels separate visually.
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
    radius = 10
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
    edges = set()
    # Sentence split on ./?/! followed by whitespace; the lookbehinds try to
    # avoid splitting inside abbreviations such as "U.S." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
    for sentence in sentences:
        entities_in_sentence = []
        # Case-insensitive substring match to find which entities appear here.
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)
        unique_entities_in_sentence = list(set(entities_in_sentence))
        # Every unordered pair within one sentence yields one edge (deduped
        # across sentences via the set + sorted tuple).
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)
    # Flatten the edges into x/y arrays; a None entry breaks the polyline
    # between consecutive segments.
    edge_x = []
    edge_y = []
    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
    fig = go.Figure()
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False
    )
    fig.add_trace(edge_trace)
    # Node trace: one marker+label per unique entity.
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        showlegend=False,
        marker=dict(
            size=unique_entities['frequency'] * 5 + 10,  # frequency-scaled size
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))
    # One invisible marker per distinct label builds the color legend.
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True
            ))
    for trace in legend_traces:
        fig.add_trace(trace)
    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )
    return fig
347
# --- CSV generation for the download button ---
def generate_entity_csv(df):
    """Serialize the entity DataFrame to an in-memory CSV buffer.

    Only the reporting columns (text, label, category, score, start, end)
    are exported. The returned BytesIO is positioned at offset 0, ready to
    be handed to a download widget.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    payload = df[export_columns].to_csv(index=False).encode('utf-8')
    # Constructing BytesIO from the payload leaves the cursor at 0, which is
    # equivalent to write() followed by seek(0).
    return BytesIO(payload)
359
- # -----------------------------------
360
# --- Existing App Functionality (HTML) ---
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """Assemble the full standalone HTML report.

    Combines the highlighted input text, the entity table, the Plotly
    visualizations (treemap, pie, bar charts, network graph) and the topic
    bubble chart into one self-contained HTML document string.
    """
    # 1. Generate Visualizations (Plotly HTML)
    # 1a. Treemap of all entities grouped category -> label -> text.
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
    # 1b. Pie Chart of entity counts per category.
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    # Cividis palette chosen to avoid reds (vs the earlier RdBu).
    fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
    # 1c. Bar Chart (Category Count)
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
    bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
    # 1d. Bar Chart of the top-10 entities that occur more than once.
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
    if not repeating_entities.empty:
        # Viridis palette chosen to avoid pink/magenta (vs Plasma).
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
    # 1e. Network Graph HTML
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
    # 1f. Topic chart HTML, with a friendly placeholder when topic modeling
    # could not be run (too few unique entities).
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'
    # 2. Highlighted input text, re-classed so the report CSS applies to it.
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
    # 3. Full entity table rendered by pandas.
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )
    # 4. Construct the final self-contained HTML document.
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
    body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
    .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
    h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
    h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
    h3 {{ color: #555; margin-top: 20px; }}
    .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
    .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
    table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
    table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
    table th {{ background-color: #f0f0f0; }}
    .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
    </style></head><body>
    <div class="container">
    <h1>Entity and Topic Analysis Report</h1>
    <div class="metadata">
    <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
    <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
    </div>
    <h2>1. Analyzed Text & Extracted Entities</h2>
    <h3>Original Text with Highlighted Entities</h3>
    <div class="highlighted-text-container">
    {highlighted_text_html}
    </div>
    <h2>2. Full Extracted Entities Table</h2>
    {entity_table_html}
    <h2>3. Data Visualizations</h2>
    <h3>3.1 Entity Distribution Treemap</h3>
    <div class="chart-box">{treemap_html}</div>
    <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
    <div class="chart-box">{pie_html}</div>
    <div class="chart-box">{bar_category_html}</div>
    <div class="chart-box">{bar_freq_html}</div>
    <h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
    <div class="chart-box">{network_html}</div>
    <h2>4. Topic Modelling</h2>
    {topic_charts_html}
    </div></body></html>
    """
    return html_content
469
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")


# --- Conditional Mobile Warning ---
# Pure-CSS banner: the media queries show the warning only on narrow
# (<=600px) screens and hide it on desktop, so no Python-side device
# detection is needed.
st.markdown(
    """
    <style>
    /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
    @media (max-width: 600px) {
    #mobile-warning-container {
    display: block; /* Show the warning container */
    background-color: #ffcccc; /* Light red/pink background */
    color: #cc0000; /* Dark red text */
    padding: 10px;
    border-radius: 5px;
    text-align: center;
    margin-bottom: 20px;
    font-weight: bold;
    border: 1px solid #cc0000;
    }
    }

    /* Hide the content by default (for larger screens) */
    @media (min-width: 601px) {
    #mobile-warning-container {
    display: none; /* Hide the warning container on desktop */
    }
    }
    </style>

    <div id="mobile-warning-container">
    ⚠️ **Tip for Mobile Users:** For the best viewing experience of the charts and tables, please switch your browser to **"Desktop Site"** view.
    </div>
    """,
    unsafe_allow_html=True
)
# ----------------------------------
507
-
508
-
509
-
510
-
511
-
512
-
513
-
514
-
515
-
516
# Custom CSS: keep tab labels readable (dark text on light gray when
# inactive, white on blue when active) and recolor the expander header.
st.markdown(
    """
    <style>
    /* ... (Keep your existing styles for main, stApp, stTextArea, stButton) ... */
    /* --- FIX: Tab Label Colors for Visibility --- */
    /* Target the container for the tab labels (the buttons) */
    [data-testid="stConfigurableTabs"] button {
    color: #333333 !important; /* Dark gray for inactive tabs */
    background-color: #f0f0f0; /* Light gray background for inactive tabs */
    border: 1px solid #cccccc;
    }
    /* Target the ACTIVE tab label */
    [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
    color: #FFFFFF !important; /* White text for active tab */
    background-color: #007bff; /* Blue background for active tab */
    border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
    }

    /* Expander header color fix (since you overwrote it to white) */
    .streamlit-expanderHeader {
    color: #007bff; /* Blue text for Expander header */
    }
    </style>
    """,
    unsafe_allow_html=True
)


# App header. Divider kept "blue" (the former "rainbow" contained red/pink,
# which clashed with the report palette).
st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
546
-
547
-
548
 
 
 
 
549
 
 
550
 
551
# Two informational tabs above the main input: embed snippet + usage notes.
tab1, tab2 = st.tabs(["Embed", "Important Notes"])

with tab1:
    with st.expander("Embed"):
        st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
        code = '''
<iframe
 src="https://aiecosystem-dataharvest.hf.space"
 frameborder="0"
 width="850"
 height="450"
></iframe>
'''
        # st.code keeps the copy-to-clipboard icon, which is desired here.
        st.code(code, language="html")


with tab2:
    expander = st.expander("**Important Notes**")
    # st.markdown (not st.code) renders the notes without a copy icon
    # while retaining the bold styling.
    expander.markdown("""
    **Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"

    **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.

    **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.

    **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
    """)


st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
584
-
585
# --- Comet ML Setup (Placeholder/Conditional) ---
# Experiment logging is enabled only when all three Comet credentials are
# present in the environment; otherwise logging is skipped entirely.
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)

# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load the GLiNER NER model once per process and cache it.

    On failure (no network, model unavailable) an error is displayed and
    the Streamlit script run is halted via st.stop().
    """
    try:
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()

model = load_ner_model()
600
# --- LONG DEFAULT TEXT (178 Words) ---
# Sample text pre-filled into the text area on first load. It is entity-rich
# (people, organizations, places, dates, money) so the demo produces a full
# report without any user input.
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire **European Union**. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This version of the **Astra** platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. This project underscores the ESA's commitment to advancing "
    "space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
)
618
-
619
-
620
-
621
-
622
-
623
-
624
-
625
-
626
- # -----------------------------------
627
# --- Session State Initialization (CRITICAL FIX) ---
# Seed every key on the first script run so later reads never raise;
# computed results then survive Streamlit reruns until explicitly cleared.
if 'show_results' not in st.session_state:
    st.session_state.show_results = False
if 'last_text' not in st.session_state:
    st.session_state.last_text = ""
if 'results_df' not in st.session_state:
    st.session_state.results_df = pd.DataFrame()
if 'elapsed_time' not in st.session_state:
    st.session_state.elapsed_time = 0.0
if 'topic_results' not in st.session_state:
    st.session_state.topic_results = None
if 'my_text_area' not in st.session_state:
    # The text area widget is keyed to this entry; pre-fill the demo text.
    st.session_state.my_text_area = DEFAULT_TEXT
640
# --- Clear Button Function ---
def clear_text():
    """Empty the text area and discard all previously computed results."""
    # Key-style access is equivalent to attribute access on session_state;
    # resetting 'my_text_area' clears the bound text widget.
    st.session_state['my_text_area'] = ""
    st.session_state['show_results'] = False
    st.session_state['last_text'] = ""
    st.session_state['results_df'] = pd.DataFrame()
    st.session_state['elapsed_time'] = 0.0
    st.session_state['topic_results'] = None
649
# --- Text Input and Clear Button ---
# The input is capped at 1000 words; the count shown below updates on each
# rerun (i.e. after the user presses Ctrl+Enter).
word_limit = 1000
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area',  # bound to session state so clear_text() can reset it
)
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
st.button("Clear text", on_click=clear_text)
659
# --- Results Trigger and Processing ---
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        with st.spinner("Extracting entities and generating report data...", show_time=True):
            # Only recompute when the input actually changed; otherwise the
            # cached session-state results are reused.
            if text != st.session_state.last_text:
                st.session_state.last_text = text
                start_time = time.time()
                # --- Model Prediction & DataFrame Creation ---
                entities = model.predict_entities(text, labels)
                df = pd.DataFrame(entities)
                if not df.empty:
                    # Clean surface forms and attach the broad report category.
                    df['text'] = df['text'].apply(remove_trailing_punctuation)
                    df['category'] = df['label'].map(reverse_category_mapping)
                    st.session_state.results_df = df
                    unique_entity_count = len(df['text'].unique())
                    # Cap the per-topic word count by the available entities.
                    N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
                    st.session_state.topic_results = perform_topic_modeling(
                        df,
                        num_topics=2,
                        num_top_words=N_TOP_WORDS_TO_USE
                    )
                    # Optional experiment logging (no-op when Comet is not
                    # configured — see comet_initialized above).
                    if comet_initialized:
                        experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                        experiment.log_parameter("input_text", text)
                        experiment.log_table("predicted_entities", df)
                        experiment.end()
                else:
                    st.session_state.results_df = pd.DataFrame()
                    st.session_state.topic_results = None
                end_time = time.time()
                st.session_state.elapsed_time = end_time - start_time
            # NOTE(review): indentation of the two lines below was lost in the
            # diff scrape; placed at spinner level so the timing message also
            # shows when cached results are reused — confirm against upstream.
            st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
            st.session_state.show_results = True
698
- # --- Display Download Link and Results ---
699
- if st.session_state.show_results:
700
- df = st.session_state.results_df
701
- df_topic_data = st.session_state.topic_results
702
- if df.empty:
703
- st.warning("No entities were found in the provided text.")
704
- else:
705
- st.subheader("Analysis Results", divider="blue")
706
- # 1. Highlighted Text
707
- st.markdown("### 1. Analyzed Text with Highlighted Entities")
708
- st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
709
-
710
- # 2. Detailed Entity Analysis Tabs
711
- st.markdown("### 2. Detailed Entity Analysis")
712
- tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
713
- with tab_category_details:
714
- st.markdown("#### Detailed Entities Table (Grouped by Category)")
715
-
716
-
717
-
718
- unique_categories = list(category_mapping.keys())
719
- tabs_category = st.tabs(unique_categories)
720
- for category, tab in zip(unique_categories, tabs_category):
721
- df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
722
- with tab:
723
- st.markdown(f"##### {category} Entities ({len(df_category)} total)")
724
- if not df_category.empty:
725
- st.dataframe(
726
- df_category,
727
- use_container_width=True,
728
- column_config={'score': st.column_config.NumberColumn(format="%.4f")}
729
- )
730
- else:
731
- st.info(f"No entities of category **{category}** were found in the text.")
732
-
733
-
734
- with st.expander("See Glossary of tags"):
735
- st.write('''
736
- - **text**: ['entity extracted from your text data']
737
- - **label**: ['label (tag) assigned to a given extracted entity']
738
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
739
- - **start**: ['index of the start of the corresponding entity']
740
- - **end**: ['index of the end of the corresponding entity']
741
- ''')
742
-
743
- with tab_treemap_viz:
744
- st.markdown("#### Treemap: Entity Distribution")
745
- fig_treemap = px.treemap(
746
- df,
747
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
748
- values='score',
749
- color='category',
750
- color_discrete_sequence=px.colors.qualitative.Dark24
751
- )
752
- fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
753
- st.plotly_chart(fig_treemap, use_container_width=True)
754
- # 3. Comparative Charts
755
- st.markdown("---")
756
- st.markdown("### 3. Comparative Charts")
757
- col1, col2, col3 = st.columns(3)
758
- grouped_counts = df['category'].value_counts().reset_index()
759
- grouped_counts.columns = ['Category', 'Count']
760
- with col1: # Pie Chart
761
- # Changed color_discrete_sequence
762
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
763
- fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
764
- st.plotly_chart(fig_pie, use_container_width=True)
765
- with col2: # Bar Chart (Category Count)
766
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
767
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
768
- st.plotly_chart(fig_bar_category, use_container_width=True)
769
- with col3: # Bar Chart (Most Frequent Entities)
770
- word_counts = df['text'].value_counts().reset_index()
771
- word_counts.columns = ['Entity', 'Count']
772
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
773
- if not repeating_entities.empty:
774
- # Changed color_discrete_sequence
775
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
776
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
777
- st.plotly_chart(fig_bar_freq, use_container_width=True)
778
- else:
779
- st.info("No entities repeat for frequency chart.")
780
- st.markdown("---")
781
- st.markdown("### 4. Entity Relationship Map")
782
- network_fig = generate_network_graph(df, st.session_state.last_text)
783
- st.plotly_chart(network_fig, use_container_width=True)
784
- st.markdown("---")
785
- st.markdown("### 5. Topic Modelling Analysis")
786
- if df_topic_data is not None and not df_topic_data.empty:
787
- bubble_figure = create_topic_word_bubbles(df_topic_data)
788
- if bubble_figure:
789
- st.plotly_chart(bubble_figure, use_container_width=True)
790
- else:
791
- st.error("Error generating Topic Word Bubble Chart.")
792
  else:
793
- st.info("Topic modeling requires more unique input (at least two unique entities).")
794
- # --- Report Download ---
795
- st.markdown("---")
796
- st.markdown("### Download Full Report Artifacts")
797
- # 1. HTML Report Download (Retained)
798
- html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
799
- st.download_button(
800
- label="Download Comprehensive HTML Report",
801
- data=html_report,
802
- file_name="ner_topic_report.html",
803
- mime="text/html",
804
- type="primary"
805
- )
806
 
807
- # 2. CSV Data Download (NEW)
808
- csv_buffer = generate_entity_csv(df)
809
- st.download_button(
810
- label="Download Extracted Entities (CSV)",
811
- data=csv_buffer,
812
- file_name="extracted_entities.csv",
813
- mime="text/csv",
814
- type="secondary"
815
- )
 
 
 
 
 
1
  import os
 
2
  import time
3
  import streamlit as st
 
4
  import pandas as pd
 
 
 
5
  import numpy as np
6
  import re
7
  import string
8
  import json
 
9
  from io import BytesIO
10
+
11
+ # --- Visualization & PPTX ---
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+ import plotly.io as pio
15
  from pptx import Presentation
16
  from pptx.util import Inches, Pt
17
+
18
+ # --- NLP & Analysis ---
19
+ from gliner import GLiNER
 
20
  from sklearn.feature_extraction.text import TfidfVectorizer
21
  from sklearn.decomposition import LatentDirichletAllocation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ # --- 1. CONFIGURATION & STYLING ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  os.environ['HF_HOME'] = '/tmp'
25
+
26
  entity_color_map = {
27
+ "person": "#10b981", "country": "#3b82f6", "city": "#4ade80",
28
+ "organization": "#f59e0b", "date": "#8b5cf6", "time": "#ec4899",
29
+ "cardinal": "#06b6d4", "money": "#f43f5e", "position": "#a855f7"
30
+ }
31
+
 
 
 
 
 
 
32
  labels = list(entity_color_map.keys())
33
  category_mapping = {
34
  "People": ["person", "organization", "position"],
35
  "Locations": ["country", "city"],
36
  "Time": ["date", "time"],
37
+ "Numbers": ["money", "cardinal"]
38
+ }
39
+ reverse_category_mapping = {label: cat for cat, lbls in category_mapping.items() for label in lbls}
40
+
41
+ # --- 2. CORE UTILITY FUNCTIONS ---
42
+
 
43
  def remove_trailing_punctuation(text_string):
 
44
  return text_string.rstrip(string.punctuation)
45
+
46
  def highlight_entities(text, df_entities):
 
47
  if df_entities.empty:
48
  return text
49
+ # Sort entities by start index descending to prevent index shifting
50
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
51
  highlighted_text = text
52
  for entity in entities:
53
+ start, end = entity['start'], entity['end']
54
+ label, entity_text = entity['label'], entity['text']
 
 
55
  color = entity_color_map.get(label, '#000000')
56
+ highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; font-weight: bold;">{entity_text}</span>'
 
 
57
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
58
+ return f'<div class="highlighted-text" style="border: 1px solid #ddd; padding: 15px; border-radius: 8px; background-color: #ffffff; line-height: 2; white-space: pre-wrap;">{highlighted_text}</div>'
 
 
59
 
60
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
 
 
 
 
 
61
  documents = df_entities['text'].unique().tolist()
62
+ if len(documents) < 2: return None
 
 
 
 
 
63
  try:
64
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=1)
 
 
 
 
 
 
 
 
65
  tfidf = tfidf_vectorizer.fit_transform(documents)
66
+ feature_names = tfidf_vectorizer.get_feature_names_out()
67
+ lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  lda.fit(tfidf)
69
 
70
+ topic_data = []
71
+ for idx, topic in enumerate(lda.components_):
72
+ top_indices = topic.argsort()[:-num_top_words - 1:-1]
73
+ for i in top_indices:
74
+ topic_data.append({'Topic_ID': f'Topic #{idx + 1}', 'Word': feature_names[i], 'Weight': topic[i]})
75
+ return pd.DataFrame(topic_data)
76
+ except: return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # --- 3. VISUALIZATION FUNCTIONS (FIXED TITLES) ---
79
 
 
 
80
  def create_topic_word_bubbles(df_topic_data):
81
+ df = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
82
+ df['x_pos'] = range(len(df))
83
+ fig = px.scatter(df, x='x_pos', y='weight', size='weight', color='topic', text='word', title='Topic Word Weights')
84
+ # FIX: Increased top margin for title visibility
85
+ fig.update_layout(margin=dict(t=80, b=50), xaxis_showticklabels=False, plot_bgcolor='#f9f9f9')
86
+ fig.update_traces(textposition='middle center', textfont=dict(color='white', size=10))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  return fig
88
 
 
 
89
  def generate_network_graph(df, raw_text):
90
+ counts = df['text'].value_counts().reset_index(name='frequency')
91
+ unique = df.drop_duplicates(subset=['text']).merge(counts, on='text')
92
+ num_nodes = len(unique)
 
 
 
 
 
 
 
 
93
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
94
+ unique['x'] = 10 * np.cos(thetas)
95
+ unique['y'] = 10 * np.sin(thetas)
96
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  fig = go.Figure()
 
 
 
 
 
 
 
 
 
98
  fig.add_trace(go.Scatter(
99
+ x=unique['x'], y=unique['y'], mode='markers+text', text=unique['text'],
100
+ marker=dict(size=unique['frequency']*5 + 15, color=[entity_color_map.get(l, '#ccc') for l in unique['label']])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  ))
102
+ # FIX: Added top margin for Title
103
+ fig.update_layout(title="Entity Relationship Map", margin=dict(t=80), showlegend=False, xaxis_visible=False, yaxis_visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ # --- 4. EXPORT FUNCTIONS ---
107
 
108
+ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
109
+ # Prepare all charts with fixed layout margins
110
+ fig_tree = px.treemap(df, path=[px.Constant("All"), 'category', 'label', 'text'], values='score', title="Entity Hierarchy")
111
+ fig_tree.update_layout(margin=dict(t=60, b=20, l=20, r=20))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
+ tree_html = fig_tree.to_html(full_html=False, include_plotlyjs='cdn')
114
+ net_html = generate_network_graph(df, text_input).to_html(full_html=False, include_plotlyjs='cdn')
 
 
 
 
 
115
 
116
+ html_template = f"""
117
+ <html>
118
+ <head>
119
+ <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
120
+ <style>
121
+ body {{ font-family: sans-serif; background: #f4f7f6; padding: 30px; }}
122
+ .card {{ background: white; padding: 25px; border-radius: 12px; margin-bottom: 25px; box-shadow: 0 2px 10px rgba(0,0,0,0.05); }}
123
+ /* FIX: Critical for title visibility */
124
+ .chart-box {{ min-height: 500px; overflow: visible !important; border: 1px solid #eee; }}
125
+ h1, h2 {{ color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 10px; }}
126
+ </style>
127
+ </head>
128
+ <body>
129
+ <div class="card">
130
+ <h1>NER & Topic Analysis Report</h1>
131
+ <p>Processing Time: {elapsed_time:.2f}s</p>
132
+ <h2>1. Highlighted Entities</h2>
133
+ {highlight_entities(text_input, df)}
134
+ <h2>2. Visual Analytics</h2>
135
+ <div class="chart-box">{tree_html}</div>
136
+ <div class="chart-box">{net_html}</div>
137
+ </div>
138
+ </body>
139
+ </html>
140
  """
141
+ return html_template
142
+
143
+ def generate_pptx_report(df):
144
+ prs = Presentation()
145
+ slide = prs.slides.add_slide(prs.slide_layouts[0])
146
+ slide.shapes.title.text = "Entity Analysis"
147
+ slide = prs.slides.add_slide(prs.slide_layouts[1])
148
+ slide.shapes.title.text = "Entity List"
149
+ tf = slide.placeholders[1].text_frame
150
+ for i, row in df.head(15).iterrows():
151
+ p = tf.add_paragraph()
152
+ p.text = f"{row['text']} ({row['label']})"
153
+ buffer = BytesIO()
154
+ prs.save(buffer)
155
+ buffer.seek(0)
156
+ return buffer
157
+
158
+ # --- 5. STREAMLIT UI & LOGIC ---
159
+
160
+ st.set_page_config(layout="wide", page_title="DataHarvest NER")
 
 
 
 
 
 
 
 
 
 
161
 
162
+ @st.cache_resource
163
+ def load_model():
164
+ return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True)
165
 
166
+ model = load_model()
167
 
168
+ # Session State Init
169
+ if 'results_df' not in st.session_state:
170
+ st.session_state.results_df = pd.DataFrame()
171
+ st.session_state.show = False
172
+
173
+ st.subheader("Entity & Topic Analysis Report Generator", divider="blue")
174
+
175
+ text = st.text_area("Paste text here (max 1000 words):", height=250)
176
+
177
+ if st.button("Run Analysis"):
178
+ if text:
179
+ with st.spinner("Processing..."):
180
+ start = time.time()
181
+ entities = model.predict_entities(text, labels)
182
+ df = pd.DataFrame(entities)
183
+ if not df.empty:
184
+ df['text'] = df['text'].apply(remove_trailing_punctuation)
185
+ df['category'] = df['label'].map(reverse_category_mapping)
186
+ st.session_state.results_df = df
187
+ st.session_state.elapsed = time.time() - start
188
+ st.session_state.topics = perform_topic_modeling(df)
189
+ st.session_state.show = True
190
+ else:
191
+ st.warning("No entities found.")
192
 
193
+ if st.session_state.show:
194
+ df = st.session_state.results_df
 
 
 
 
195
 
196
+ st.markdown("### 1. Extracted Entities")
197
+ st.markdown(highlight_entities(text, df), unsafe_allow_html=True)
198
 
199
+ t1, t2, t3 = st.tabs(["Charts", "Network Map", "Topics"])
200
 
201
+ with t1:
202
+ fig_tree = px.treemap(df, path=['category', 'label', 'text'], values='score', title="Entity Treemap")
203
+ # Ensure the preview also has margins
204
+ fig_tree.update_layout(margin=dict(t=50))
205
+ st.plotly_chart(fig_tree, use_container_width=True)
206
 
207
+ with t2:
208
+ st.plotly_chart(generate_network_graph(df, text), use_container_width=True)
209
+
210
+ with t3:
211
+ if st.session_state.topics is not None:
212
+ st.plotly_chart(create_topic_word_bubbles(st.session_state.topics), use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  else:
214
+ st.info("Not enough data for topic modeling.")
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
+ st.divider()
217
+ st.markdown("### Download Artifacts")
218
+ c1, c2, c3 = st.columns(3)
219
+
220
+ with c1:
221
+ st.download_button("Download HTML Report",
222
+ generate_html_report(df, text, st.session_state.elapsed, st.session_state.topics),
223
+ "report.html", "text/html", type="primary")
224
+ with c2:
225
+ csv = df.to_csv(index=False).encode('utf-8')
226
+ st.download_button("Download CSV Data", csv, "entities.csv", "text/csv")
227
+ with c3:
228
+ st.download_button("Download PPTX Summary", generate_pptx_report(df), "summary.pptx")