AIEcosystem committed on
Commit
1c3f8f0
·
verified ·
1 Parent(s): b04eb29

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +506 -492
src/streamlit_app.py CHANGED
@@ -18,464 +18,490 @@ from gliner import GLiNER
18
  from streamlit_extras.stylable_container import stylable_container
19
# Using a try/except for comet_ml import
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in so the app still runs when comet_ml is not installed."""

        def __init__(self, **kwargs):
            pass

        def log_parameter(self, *args):
            pass

        def log_table(self, *args):
            pass

        def end(self):
            pass

# --- Model Home Directory (Fix for deployment environments) ---
# Set HF_HOME environment variable to a writable path
os.environ['HF_HOME'] = '/tmp'
31
# --- Color Map for Highlighting and Network Graph Nodes ---
# One hex color per NER label; shared by the text highlighter and graph nodes.
entity_color_map = dict(
    person="#10b981",
    username="#3b82f6",
    hashtag="#4ade80",
    mention="#f97316",
    organization="#f59e0b",
    community="#8b5cf6",
    position="#ec4899",
    location="#06b6d4",
    event="#f43f5e",
    product="#a855f7",
    platform="#eab308",
    date="#6366f1",
    media_type="#14b8a6",
    url="#60a5fa",
    nationality_religion="#fb7185",
)
49
# --- Utility Functions ---
def extract_label(node_name):
    """Return the trailing parenthesized label of 'Text (Label)'.

    Falls back to "Unknown" when the string does not end with '(...)'.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found is None:
        return "Unknown"
    return found.group(1)
54
 
55
def remove_trailing_punctuation(text_string):
    """Strip any trailing ASCII punctuation characters from *text_string*."""
    cleaned = text_string.rstrip(string.punctuation)
    return cleaned
58
 
59
def highlight_entities(text, df_entities):
    """Generates HTML to display text with entities highlighted and colored.

    Args:
        text: The raw input string the entities were extracted from.
        df_entities: DataFrame with 'start', 'end', 'label' and 'text' columns.

    Returns:
        An HTML string: the text wrapped in a styled div, with each entity
        wrapped in a colored span carrying a tooltip with its label.
    """
    import html  # local import: top-of-file import block is outside this edit

    if df_entities.empty:
        return text
    # Sort entities by start index descending to insert highlights without affecting subsequent indices
    entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
    highlighted_text = text
    for entity in entities:
        start = entity['start']
        end = entity['end']
        label = entity['label']
        entity_text = entity['text']
        color = entity_color_map.get(label, '#000000')

        # FIX: escape the model-extracted text and label before embedding them
        # in markup; raw '<', '&' or '"' would otherwise break the report HTML
        # or inject content (the input text is user-controlled).
        # NOTE(review): text outside entity spans is still emitted unescaped,
        # as in the original — a full fix would escape the whole document.
        safe_label = html.escape(label, quote=True)
        safe_text = html.escape(entity_text)

        # Create a span with background color and tooltip
        highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{safe_label}">{safe_text}</span>'

        # Replace the original text segment with the highlighted HTML
        highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
    # Use a div to mimic the Streamlit input box style for the report
    return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
80
-
81
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Run LDA over the unique extracted entity strings and return a tidy
    DataFrame of (Topic_ID, Word, Weight) rows for visualization.

    Uses TF-IDF with stop_words='english', max_df=0.95, min_df=1.
    Returns None when fewer than two unique entities exist, or when
    vectorization/fitting fails (the error is surfaced via st.error).
    """
    # Every unique entity string acts as one "document" for the vectorizer.
    docs = df_entities['text'].unique().tolist()

    if len(docs) < 2:
        return None

    top_n = min(num_top_words, len(docs))

    try:
        # stop_words='english' filters common tokens produced by multi-word
        # entities (e.g., "The" from "The White House").
        vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,  # retained at 1 to keep all unique entities
            stop_words='english'
        )
        doc_term_matrix = vectorizer.fit_transform(docs)
        feature_names = vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1
        )
        lda.fit(doc_term_matrix)

        rows = []
        for topic_idx, topic in enumerate(lda.components_):
            # Indices of the top_n highest-weighted features, descending.
            best_indices = topic.argsort()[:-top_n - 1:-1]
            for i in best_indices:
                rows.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': feature_names[i],
                    'Weight': topic[i],
                })
        return pd.DataFrame(rows)

    except Exception as e:
        st.error(f"Topic modeling failed: {e}")
        return None
130
 
131
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of top-word weights for every topic.

    Returns None when the input DataFrame is empty.
    """
    if df_topic_data.empty:
        return None

    # Bubble size encodes the LDA word weight; one row of bubbles per topic.
    figure = px.scatter(
        df_topic_data,
        x='Word',
        y='Topic_ID',
        size='Weight',
        color='Topic_ID',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False},
    )

    figure.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Topic ID",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True, 'autorange': 'reversed'},
        showlegend=True,
        plot_bgcolor='#FFF0F5',
        paper_bgcolor='#FFF0F5',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )

    figure.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))

    return figure
164
 
165
def generate_network_graph(df, raw_text):
    """
    Generates a network graph visualization (Node Plot) with edges
    based on entity co-occurrence in sentences.

    Args:
        df: Entity DataFrame; reads the 'text', 'label', 'score' columns.
        raw_text: Original input text, used only for sentence segmentation.

    Returns:
        A plotly Figure. With fewer than two unique entities, a bare Figure
        whose title explains that there is not enough data.
    """
    # Frequency of each entity surface string across the whole text.
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # Merge counts with unique entities (text + label)
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')

    if unique_entities.shape[0] < 2:
        # Return a simple figure with a message if not enough data
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    num_nodes = len(unique_entities)
    # Evenly spaced angles around a circle, one per node.
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)

    radius = 10

    # Assign circular positions + a little randomness
    # NOTE(review): positions use np.random without a fixed seed, so the
    # layout differs between runs — confirm this jitter is intentional.
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    # Map entity text to its coordinates for easy lookup
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')

    # ----------------------------------------------------------------------
    # 1. Identify Edges (Co-occurrence in sentences)
    # ----------------------------------------------------------------------
    edges = set()

    # Simple sentence segmentation (handles standard punctuation followed by space)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)

    for sentence in sentences:
        # Find unique entities that are substrings of this sentence
        # (case-insensitive containment check).
        entities_in_sentence = []
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)

        # Create edges (pairs) based on co-occurrence
        unique_entities_in_sentence = list(set(entities_in_sentence))

        # Create all unique pairs (edges)
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]

                # Ensure consistent order for the set to avoid duplicates like (A, B) and (B, A)
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)

    # ----------------------------------------------------------------------
    # 2. Create Plotly Trace Data for Edges
    # ----------------------------------------------------------------------
    edge_x = []
    edge_y = []

    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            # Append coordinates for line segment: [x1, x2, None] for separation
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()

    # Add Edge Trace (Lines)
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False  # Edges don't need a legend entry
    )
    fig.add_trace(edge_trace)

    # ----------------------------------------------------------------------
    # 3. Add Node Trace (Markers)
    # ----------------------------------------------------------------------
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        # FIX: Explicitly set showlegend=False for the main node trace
        # as we are creating separate traces for the legend colors below.
        showlegend=False,
        marker=dict(
            # Node size grows linearly with entity frequency.
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # Adding discrete traces for the legend based on unique labels
    # (one invisible marker per label, so the legend shows label colors).
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None],
                y=[None],
                mode='markers',
                marker=dict(size=10, color=color),
                name=f"{label.capitalize()}",
                showlegend=True  # Ensure legend traces are explicitly visible
            ))
    for trace in legend_traces:
        fig.add_trace(trace)

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        # Set explicit range to ensure padding for text labels on the edge
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )

    return fig
309
-
310
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """
    Generates a full HTML report containing all analysis results and visualizations.
    FIXED: Treemap color (added color_continuous_scale) and chart overlap (set explicit heights).

    Args:
        df: Entity DataFrame with 'text', 'label', 'score', 'start', 'end', 'category'.
        text_input: The original analyzed text.
        elapsed_time: Processing time in seconds, shown in the report metadata.
        df_topic_data: Topic-modeling DataFrame, or None when unavailable.

    Returns:
        A complete standalone HTML document as a string.
    """

    # 1. Generate Visualizations (Plotly HTML)

    # 1a. Treemap - FIX: Added color_continuous_scale to ensure color renders in static HTML
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_continuous_scale=px.colors.sequential.Agsunset # Force a color scale
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), height=500) # Added height for treemap
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie Chart - FIX: Set explicit height to prevent overlap in the grid
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=px.colors.sequential.RdBu)
    fig_pie.update_layout(margin=dict(t=50, b=10), height=400)
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar Chart (Category Count) - FIX: Set explicit height
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10), height=400)
    bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs='cdn')

    # 1d. Bar Chart (Most Frequent Entities) - FIX: Set explicit height
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']

    # Top 10 repeating entities
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    # Fallback markup used when no entity repeats.
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'

    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Plasma)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10), height=400)
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network Graph HTML - UPDATED to pass text_input
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic Charts HTML (Now a single Bubble Chart with Placeholder logic)
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        # Placeholder for low data
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Get Highlighted Text
    # The .replace swaps the inline-styled div for the report's CSS class.
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")

    # 3. Entity Tables (Pandas to HTML)
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the Final HTML
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
        .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
        h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
        h3 {{ color: #555; margin-top: 20px; }}
        .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
        .grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px; }}
        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }}
        table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        table th {{ background-color: #f0f0f0; }}
        /* Specific styling for highlighted text element */
        .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
        @media (max-width: 768px) {{ .grid {{ grid-template-columns: 1fr; }} }}
    </style></head><body>
    <div class="container">
        <h1>Entity and Topic Analysis Report</h1>

        <div class="metadata">
            <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
            <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
        </div>
        <!-- Section 1: Original Text & Highlighted Entities -->
        <h2>1. Analyzed Text & Extracted Entities</h2>
        <h3>Original Text with Highlighted Entities</h3>
        <div class="highlighted-text-container">
            {highlighted_text_html}
        </div>

        <!-- Section 2: Full Extracted Entities Table -->
        <h2>2. Full Extracted Entities Table</h2>
        {entity_table_html}
        <!-- Section 3: Visualizations (Treemap, Pie, Bar Charts) -->
        <h2>3. Data Visualizations</h2>

        <h3>3.1 Entity Distribution Treemap</h3>
        <div class="chart-box">{treemap_html}</div>
        <h3>3.2 Comparative Charts (Pie, Category Count, Frequency)</h3>
        <div class="grid">
            <div class="chart-box">{pie_html}</div>
            <div class="chart-box">{bar_category_html}</div>
            <div class="chart-box">{bar_freq_html}</div>
        </div>
        <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
        <div class="chart-box">{network_html}</div>

        <!-- Section 4: Topic Modeling -->
        <h2>4. Topic Modeling (LDA on Entities)</h2>
        {topic_charts_html}

    </div></body></html>
    """
    return html_content
443
-
 
 
 
 
 
 
 
 
 
 
 
 
 
444
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
# Inject app-wide CSS (pink theme) via raw HTML; requires unsafe_allow_html.
st.markdown(
    """
    <style>
    /* Overall app container - NO SIDEBAR */
    .main {
        background-color: #FFF0F5; /* Blanched Almond/Light Pink */
        color: #333333; /* Dark grey text for contrast */
    }
    .stApp {
        background-color: #FFF0F5;
    }
    /* Text Area background and text color (input fields) */
    .stTextArea textarea {
        background-color: #FFFAF0; /* Floral White/Near white for input fields */
        color: #000000; /* Black text for input */
        border: 1px solid #FF69B4; /* Deep Pink border */
    }
    /* Button styling */
    .stButton > button {
        background-color: #FF69B4; /* Deep Pink for the button */
        color: #FFFFFF; /* White text for contrast */
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
    }
    /* Expander header and content background */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
        color: #333333;
    }
    </style>
    """,
    unsafe_allow_html=True)
479
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
480
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
481
  expander = st.expander("**Important notes**")
@@ -489,77 +515,77 @@ comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAM
489
# --- Label Definitions and Category Mapping ---
# The NER label set mirrors the color map, so every label has a display color.
labels = list(entity_color_map.keys())
category_mapping = {
    "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
    "Location & Organization": ["location", "organization"],
    "Temporal & Events": ["event", "date"],
    "Digital & Products": ["platform", "product", "media_type", "url"],
}
# Invert the mapping to label -> category for per-entity lookups.
reverse_category_mapping = {}
for _category, _label_list in category_mapping.items():
    for _label in _label_list:
        reverse_category_mapping[_label] = _category
498
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Loads the GLiNER model and caches it.

    Returns:
        The pretrained GLiNER multitask model, constrained to this app's labels.

    Side effects:
        On failure, shows st.error and halts the script via st.stop().
    """
    try:
        # Use nested_ner=True and num_gen_sequences=2 for potentially higher recall
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()

# Load once per process; st.cache_resource shares the model across reruns.
model = load_ner_model()
510
 
511
# --- LONG DEFAULT TEXT (178 Words) ---
# Seed content for the text area (entity-rich sample covering all label types).
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
)
527
  # -----------------------------------
528
 
529
# --- Session State Initialization (CRITICAL FIX) ---
# Seed every key on first run so later reads never fail; in particular the
# text-area key must exist before st.text_area is instantiated.
_SESSION_DEFAULTS = {
    'show_results': False,
    'last_text': "",
    'results_df': pd.DataFrame(),
    'elapsed_time': 0.0,
    'topic_results': None,
    # FIX: Initialize the text area key with default text before st.text_area is called
    'my_text_area': DEFAULT_TEXT,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
543
 
544
# --- Clear Button Function (MODIFIED) ---
def clear_text():
    """Empty the text area and drop all previously computed results."""
    # MODIFIED: the widget key is set to an empty string for true clearing.
    st.session_state['my_text_area'] = ""
    st.session_state['show_results'] = False
    st.session_state['last_text'] = ""
    st.session_state['results_df'] = pd.DataFrame()
    st.session_state['elapsed_time'] = 0.0
    st.session_state['topic_results'] = None
554
 
555
# --- Text Input and Clear Button ---
word_limit = 1000
# The text area reads its content from the pre-initialized 'my_text_area'
# session-state key. FIX: do not also pass `value=` — supplying a default
# while the key is already set in st.session_state makes Streamlit emit the
# "widget ... had its value set via the Session State API" warning, and the
# default is ignored anyway.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area'
)

word_count = len(text.split())
@@ -568,15 +594,15 @@ st.button("Clear text", on_click=clear_text)
568
 
569
  # --- Results Trigger and Processing (Updated Logic) ---
570
  if st.button("Results"):
571
- if not text.strip():
572
- st.warning("Please enter some text to extract entities.")
573
- st.session_state.show_results = False
574
- elif word_count > word_limit:
575
- st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
576
- st.session_state.show_results = False
577
- else:
578
- with st.spinner("Extracting entities and generating report data...", show_time=True):
579
- if text != st.session_state.last_text:
580
  st.session_state.last_text = text
581
  start_time = time.time()
582
 
@@ -599,6 +625,7 @@ if st.button("Results"):
599
  )
600
 
601
  if comet_initialized:
 
602
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
603
  experiment.log_parameter("input_text", text)
604
  experiment.log_table("predicted_entities", df)
@@ -611,8 +638,8 @@ if st.button("Results"):
611
  st.session_state.elapsed_time = end_time - start_time
612
 
613
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
614
- st.session_state.show_results = True
615
-
616
  # --- Display Download Link and Results (FIXED INDENTATION AND NEW LAYOUT) ---
617
  if st.session_state.show_results:
618
  df = st.session_state.results_df
@@ -671,14 +698,15 @@ if st.session_state.show_results:
671
  # TAB 2: Treemap
672
  with tab_treemap_viz:
673
  st.markdown("#### Treemap: Entity Distribution")
674
- # Treemap (Uses the corrected color in the report generation function)
 
675
  fig_treemap = px.treemap(
676
  df,
677
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
678
- values='score',
679
  color='category',
680
  title="Entity Distribution by Category and Label",
681
- color_continuous_scale=px.colors.sequential.Agsunset # Added color scale here for Streamlit preview too
682
  )
683
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
684
  st.plotly_chart(fig_treemap, use_container_width=True)
@@ -687,16 +715,22 @@ if st.session_state.show_results:
687
  st.markdown("---")
688
  st.markdown("### 4. Comparative Charts")
689
 
 
 
690
  col1, col2, col3 = st.columns(3)
691
 
692
  # Pie Chart
693
  grouped_counts = df['category'].value_counts().reset_index()
694
  grouped_counts.columns = ['Category', 'Count']
695
- fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution by Category', color_discrete_sequence=px.colors.sequential.RdBu)
 
 
696
  with col1:
697
  st.plotly_chart(fig_pie, use_container_width=True)
698
  # Category Count Bar Chart
699
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
 
 
700
  with col2:
701
  st.plotly_chart(fig_bar_category, use_container_width=True)
702
  # Most Frequent Entities Bar Chart
@@ -705,68 +739,48 @@ if st.session_state.show_results:
705
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
706
  fig_bar_freq = go.Figure().update_layout(title="No repeating entities for plot")
707
  if not repeating_entities.empty:
708
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Plasma)
 
 
709
  with col3:
710
  st.plotly_chart(fig_bar_freq, use_container_width=True)
711
-
712
- # 5. Network Graph (NOW OUTSIDE ALL TABS)
713
  st.markdown("---")
714
  st.markdown("### 5. Entity Co-occurrence Network")
715
- st.markdown("Edges connect entities that appear in the same sentence.")
716
- fig_network = generate_network_graph(df, st.session_state.last_text)
717
- if not isinstance(fig_network, go.Figure):
718
- # If the function returned the string message (not enough data)
719
- st.info(fig_network.layout.title.text)
720
- else:
721
- st.plotly_chart(fig_network, use_container_width=True)
722
-
723
  # 6. Topic Modeling
724
  st.markdown("---")
725
  st.markdown("### 6. Topic Modeling (LDA on Entities)")
726
  if df_topic_data is not None and not df_topic_data.empty:
727
- st.markdown("##### Topic Word Weights (Bubble Chart)")
728
  bubble_figure = create_topic_word_bubbles(df_topic_data)
729
- st.plotly_chart(bubble_figure, use_container_width=True)
 
 
 
730
  else:
731
- st.info("Topic Modeling requires at least two unique entities to generate the Topic Bubble Chart.")
732
 
733
- # 7. Download Button (HTML Report)
734
- # Generate the full report HTML for download
735
- report_html_content = generate_html_report(
736
- df,
737
- st.session_state.last_text,
738
- st.session_state.elapsed_time,
739
- df_topic_data
740
- )
741
 
742
- # Convert HTML content to bytes for download
743
- b64_html = io.BytesIO(report_html_content.encode('utf-8'))
 
 
 
 
 
744
 
745
- st.markdown("---")
746
- with stylable_container(
747
- key="download_container",
748
- css_styles="""
749
- button {
750
- background-color: #007bff;
751
- color: white;
752
- font-weight: bold;
753
- border: 2px solid #007bff;
754
- padding: 10px 20px;
755
- border-radius: 8px;
756
- }
757
- button:hover {
758
- background-color: #0056b3;
759
- }
760
- """
761
- ):
762
- st.download_button(
763
- label="Download Full HTML Report 📥",
764
- data=b64_html,
765
- file_name=f"entity_topic_report_{time.strftime('%Y%m%d_%H%M%S')}.html",
766
- mime="text/html",
767
- )
768
-
769
-
770
 
771
 
772
 
 
18
  from streamlit_extras.stylable_container import stylable_container
19
  # Using a try/except for comet_ml import
20
try:
    from comet_ml import Experiment
except ImportError:
    # comet_ml is optional: fall back to a no-op stub with the same
    # interface so the rest of the app can call it unconditionally.
    class Experiment:
        """No-op stand-in for comet_ml.Experiment when comet_ml is absent."""

        def __init__(self, **kwargs):
            pass

        def log_parameter(self, *args):
            pass

        def log_table(self, *args):
            pass

        def end(self):
            pass
28
# --- Model Home Directory (Fix for deployment environments) ---
# Set HF_HOME to a writable path so Hugging Face model downloads/caches
# succeed on read-only deployment filesystems.
os.environ['HF_HOME'] = '/tmp'
31
  # --- Color Map for Highlighting and Network Graph Nodes ---
32
# Maps each NER label to a fixed hex color, used both for inline text
# highlighting and for network-graph node/legend colors.
entity_color_map = {
    "person": "#10b981",
    "username": "#3b82f6",
    "hashtag": "#4ade80",
    "mention": "#f97316",
    "organization": "#f59e0b",
    "community": "#8b5cf6",
    "position": "#ec4899",
    "location": "#06b6d4",
    "event": "#f43f5e",
    "product": "#a855f7",
    "platform": "#eab308",
    "date": "#6366f1",
    "media_type": "#14b8a6",
    "url": "#60a5fa",
    "nationality_religion": "#fb7185",
}
49
  # --- Utility Functions ---
50
def extract_label(node_name):
    """Extract the label from a node string like 'Text (Label)'.

    Returns the text inside the trailing parentheses, or "Unknown" when
    the string has no trailing '(...)' group.
    """
    match = re.search(r'\(([^)]+)\)$', node_name)
    return match.group(1) if match else "Unknown"
54
 
55
  def remove_trailing_punctuation(text_string):
56
+ """Removes trailing punctuation from a string."""
57
+ return text_string.rstrip(string.punctuation)
58
 
59
  def highlight_entities(text, df_entities):
60
+ """Generates HTML to display text with entities highlighted and colored."""
61
+ if df_entities.empty:
62
+ return text
63
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
64
+ entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
65
+ highlighted_text = text
66
+ for entity in entities:
67
+ start = entity['start']
68
+ end = entity['end']
69
+ label = entity['label']
70
+ entity_text = entity['text']
71
+ color = entity_color_map.get(label, '#000000')
72
+
73
+ # Create a span with background color and tooltip
74
+ highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
75
+
76
+ # Replace the original text segment with the highlighted HTML
77
+ highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
78
  # Use a div to mimic the Streamlit input box style for the report
79
+ return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
80
+
81
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """Run LDA topic modeling over the extracted entity strings.

    Each unique entity text is treated as one document. Returns a DataFrame
    with columns ['Topic_ID', 'Word', 'Weight'] for visualization, or None
    when there are fewer than two unique entities or modeling fails.

    Uses TF-IDF with stop_words='english', max_df=0.95, min_df=1.
    """
    # Aggregate all unique entity text into a single document list.
    documents = df_entities['text'].unique().tolist()

    if len(documents) < 2:
        return None

    # Cap the number of reported words by the number of documents.
    N = min(num_top_words, len(documents))

    try:
        # stop_words='english' filters common words tokenized out of
        # multi-word entities (e.g., "The" from "The White House").
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,  # keep every unique entity token
            stop_words='english'
        )
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)

        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # argsort ascending, so slice from the end for the top-N words.
            top_words_indices = topic.argsort()[:-N - 1:-1]
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]

            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })

        return pd.DataFrame(topic_data_list)

    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Topic modeling failed: {e}")
        return None
131
 
132
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of per-topic word weights.

    Expects the DataFrame produced by perform_topic_modeling (columns
    'Topic_ID', 'Word', 'Weight'). Returns a plotly Figure, or None when
    the input is empty.
    """
    if df_topic_data.empty:
        return None

    fig = px.scatter(
        df_topic_data,
        x='Word',
        y='Topic_ID',
        size='Weight',       # bubble area encodes the LDA word weight
        color='Topic_ID',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False}
    )

    fig.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Topic ID",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True, 'autorange': 'reversed'},
        showlegend=True,
        plot_bgcolor='#FFF0F5',
        paper_bgcolor='#FFF0F5',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )

    fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))

    return fig
165
 
166
def generate_network_graph(df, raw_text):
    """Build an entity co-occurrence network as a Plotly figure.

    Nodes are unique (text, label) entities sized by frequency; an edge
    connects two entities that appear in the same sentence of *raw_text*.
    Returns a go.Figure; when fewer than two unique entities exist, the
    figure carries only an explanatory title.
    """
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # Merge counts with unique entities (text + label).
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')

    if unique_entities.shape[0] < 2:
        # Not enough data for a meaningful graph.
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    num_nodes = len(unique_entities)
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)

    radius = 10

    # Circular layout with a little jitter so labels don't overlap exactly.
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    # Map entity text to its coordinates for easy lookup.
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')

    # ------------------------------------------------------------------
    # 1. Identify edges (co-occurrence in sentences)
    # ------------------------------------------------------------------
    edges = set()

    # Simple sentence segmentation (punctuation followed by whitespace,
    # with lookbehinds to avoid splitting on common abbreviations).
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)

    for sentence in sentences:
        # Case-insensitive substring match of each entity in the sentence.
        entities_in_sentence = []
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)

        unique_entities_in_sentence = list(set(entities_in_sentence))

        # All unique pairs of co-occurring entities become edges.
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]

                # Sort so (A, B) and (B, A) collapse to one edge.
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)

    # ------------------------------------------------------------------
    # 2. Create Plotly trace data for edges
    # ------------------------------------------------------------------
    edge_x = []
    edge_y = []

    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            # [x1, x2, None] — the None breaks the line between segments.
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()

    # Edge trace (lines).
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False  # edges don't need a legend entry
    )
    fig.add_trace(edge_trace)

    # ------------------------------------------------------------------
    # 3. Node trace (markers)
    # ------------------------------------------------------------------
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        # The node trace stays out of the legend; dedicated per-label
        # traces below provide the colored legend entries.
        showlegend=False,
        marker=dict(
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # One invisible marker trace per distinct label, so the legend shows
    # the label/color mapping.
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None],
                y=[None],
                mode='markers',
                marker=dict(size=10, color=color),
                name=f"{label.capitalize()}",
                showlegend=True  # legend traces must be explicitly visible
            ))
    for trace in legend_traces:
        fig.add_trace(trace)

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        # Explicit range leaves padding for text labels at the rim.
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )

    return fig
310
+
311
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """Assemble the full standalone HTML analysis report.

    Embeds the highlighted text, the entity table, and all Plotly charts
    (treemap, pie, bar charts, network graph, topic bubbles) as inline
    HTML. Returns the report as one HTML string.

    FIX 1: Treemap uses a discrete color sequence to prevent black tiles.
    FIX 2: CSS grid uses min-width on items to prevent plot overlap.
    """
    # 1. Generate visualizations (Plotly HTML fragments)

    # 1a. Treemap — explicit color_discrete_sequence keeps tiles colored.
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24  # robust color sequence
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie chart of entity counts per category.
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category',
                     title='Distribution of Entities by Category',
                     color_discrete_sequence=px.colors.sequential.RdBu)
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar chart (category counts).
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
                              color='Category', title='Total Entities per Category',
                              color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},
                                   margin=dict(t=50, b=10))
    bar_category_html = fig_bar_category.to_html(full_html=False,
                                                 include_plotlyjs='cdn')

    # 1d. Bar chart (most frequent entities).
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']

    # Top 10 repeating entities (count > 1).
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'

    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
                              color='Entity', title='Top 10 Most Frequent Entities',
                              color_discrete_sequence=px.colors.sequential.Plasma)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},
                                   margin=dict(t=50, b=10))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network graph — co-occurrence over the raw input text.
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic chart (single bubble chart, with placeholder for low data).
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        # Placeholder shown when there is not enough data for topics.
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Highlighted text (re-tag the wrapper div with the report CSS class).
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")

    # 3. Entity table (pandas to HTML).
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the final HTML document.
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
    body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
    .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
    h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
    h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
    h3 {{ color: #555; margin-top: 20px; }}
    .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
    /* FIX 2: Modified grid to ensure each item gets min 30% of the container width */
    .grid {{
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); /* Adjusted min-width for better fit */
        gap: 20px;
        margin-top: 20px;
    }}
    .chart-box {{
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        /* Important: Set a minimum width for the chart box in the grid */
        min-width: 0;
    }}
    table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
    table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
    table th {{ background-color: #f0f0f0; }}
    /* Specific styling for highlighted text element */
    .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
    @media (max-width: 1050px) {{ /* Increased breakpoint to help prevent overlap */
        .grid {{
            grid-template-columns: 1fr; /* Stack charts vertically on smaller screens */
        }}
    }}
    </style></head><body>
    <div class="container">
    <h1>Entity and Topic Analysis Report</h1>

    <div class="metadata">
    <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
    <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
    </div>
    <h2>1. Analyzed Text & Extracted Entities</h2>
    <h3>Original Text with Highlighted Entities</h3>
    <div class="highlighted-text-container">
    {highlighted_text_html}
    </div>

    <h2>2. Full Extracted Entities Table</h2>
    {entity_table_html}
    <h2>3. Data Visualizations</h2>

    <h3>3.1 Entity Distribution Treemap</h3>
    <div class="chart-box">{treemap_html}</div>
    <h3>3.2 Comparative Charts (Pie, Category Count, Frequency)</h3>
    <div class="grid">
    <div class="chart-box">{pie_html}</div>
    <div class="chart-box">{bar_category_html}</div>
    <div class="chart-box">{bar_freq_html}</div>
    </div>
    <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
    <div class="chart-box">{network_html}</div>

    <h2>4. Topic Modeling (LDA on Entities)</h2>
    {topic_charts_html}

    </div></body></html>
    """
    return html_content
469
+
470
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
st.markdown(
    """
    <style>
    /* Overall app container - NO SIDEBAR */
    .main {
        background-color: #FFF0F5; /* Blanched Almond/Light Pink */
        color: #333333; /* Dark grey text for contrast */
    }
    .stApp {
        background-color: #FFF0F5;
    }
    /* Text Area background and text color (input fields) */
    .stTextArea textarea {
        background-color: #FFFAF0; /* Floral White/Near white for input fields */
        color: #000000; /* Black text for input */
        border: 1px solid #FF69B4; /* Deep Pink border */
    }
    /* Button styling */
    .stButton > button {
        background-color: #FF69B4; /* Deep Pink for the button */
        color: #FFFFFF; /* White text for contrast */
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
    }
    /* Expander header and content background */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
        color: #333333;
    }
    </style>
    """,
    unsafe_allow_html=True)
st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
expander = st.expander("**Important notes**")
515
  # --- Label Definitions and Category Mapping ---
516
  labels = list(entity_color_map.keys())
517
  category_mapping = {
518
+ "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
519
+ "Location & Organization": ["location", "organization"],
520
+ "Temporal & Events": ["event", "date"],
521
+ "Digital & Products": ["platform", "product", "media_type", "url"],
522
  }
523
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
524
  # --- Model Loading ---
525
  @st.cache_resource
526
  def load_ner_model():
527
+ """Loads the GLiNER model and caches it."""
528
+ try:
529
+ # Use nested_ner=True and num_gen_sequences=2 for potentially higher recall
530
+ return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
531
+ except Exception as e:
532
+ st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
533
+ st.stop()
534
+
535
  model = load_ner_model()
536
 
537
# --- LONG DEFAULT TEXT (178 Words) ---
# Sample input pre-loaded into the text area so a first-time user can
# generate a report immediately.
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
)
553
  # -----------------------------------
554
 
555
  # --- Session State Initialization (CRITICAL FIX) ---
556
  if 'show_results' not in st.session_state:
557
+ st.session_state.show_results = False
558
  if 'last_text' not in st.session_state:
559
+ st.session_state.last_text = ""
560
  if 'results_df' not in st.session_state:
561
+ st.session_state.results_df = pd.DataFrame()
562
  if 'elapsed_time' not in st.session_state:
563
+ st.session_state.elapsed_time = 0.0
564
  if 'topic_results' not in st.session_state:
565
+ st.session_state.topic_results = None
566
  # FIX: Initialize the text area key with default text before st.text_area is called
567
  if 'my_text_area' not in st.session_state:
568
+ st.session_state.my_text_area = DEFAULT_TEXT
569
 
570
  # --- Clear Button Function (MODIFIED) ---
571
  def clear_text():
572
+ """Clears the text area (sets it to an empty string) and hides results."""
573
  # MODIFIED: Set to empty string for true clearing
574
+ st.session_state['my_text_area'] = ""
575
+ st.session_state.show_results = False
576
+ st.session_state.last_text = ""
577
+ st.session_state.results_df = pd.DataFrame()
578
+ st.session_state.elapsed_time = 0.0
579
+ st.session_state.topic_results = None
580
 
581
  # --- Text Input and Clear Button ---
582
  word_limit = 1000
583
  # The text area now safely uses the pre-initialized session state value
584
  text = st.text_area(
585
+ f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
586
+ height=250,
587
+ key='my_text_area',
588
+ value=st.session_state.my_text_area
589
  )
590
 
591
  word_count = len(text.split())
 
594
 
595
  # --- Results Trigger and Processing (Updated Logic) ---
596
  if st.button("Results"):
597
+ if not text.strip():
598
+ st.warning("Please enter some text to extract entities.")
599
+ st.session_state.show_results = False
600
+ elif word_count > word_limit:
601
+ st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
602
+ st.session_state.show_results = False
603
+ else:
604
+ with st.spinner("Extracting entities and generating report data...", show_time=True):
605
+ if text != st.session_state.last_text:
606
  st.session_state.last_text = text
607
  start_time = time.time()
608
 
 
625
  )
626
 
627
  if comet_initialized:
628
+ # FIX APPLIED HERE: Corrected indentation for the following lines
629
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
630
  experiment.log_parameter("input_text", text)
631
  experiment.log_table("predicted_entities", df)
 
638
  st.session_state.elapsed_time = end_time - start_time
639
 
640
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
641
+ st.session_state.show_results = True
642
+
643
  # --- Display Download Link and Results (FIXED INDENTATION AND NEW LAYOUT) ---
644
  if st.session_state.show_results:
645
  df = st.session_state.results_df
 
698
  # TAB 2: Treemap
699
  with tab_treemap_viz:
700
  st.markdown("#### Treemap: Entity Distribution")
701
+ # Treemap
702
+ # FIX 1 (Streamlit): Added a robust color sequence here too for consistency in the Streamlit plot
703
  fig_treemap = px.treemap(
704
  df,
705
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
706
+ values='score',
707
  color='category',
708
  title="Entity Distribution by Category and Label",
709
+ color_discrete_sequence=px.colors.qualitative.Dark24 # Applied fix here
710
  )
711
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
712
  st.plotly_chart(fig_treemap, use_container_width=True)
 
715
  st.markdown("---")
716
  st.markdown("### 4. Comparative Charts")
717
 
718
+ # FIX 2 (Streamlit): The Streamlit plot columns (col1, col2, col3) naturally handle overlap,
719
+ # so no change is needed here, the fix is only in the HTML report.
720
  col1, col2, col3 = st.columns(3)
721
 
722
  # Pie Chart
723
  grouped_counts = df['category'].value_counts().reset_index()
724
  grouped_counts.columns = ['Category', 'Count']
725
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',
726
+ title='Distribution by Category',
727
+ color_discrete_sequence=px.colors.sequential.RdBu)
728
  with col1:
729
  st.plotly_chart(fig_pie, use_container_width=True)
730
  # Category Count Bar Chart
731
+ fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
732
+ color='Category', title='Total Entities per Category',
733
+ color_discrete_sequence=px.colors.qualitative.Pastel)
734
  with col2:
735
  st.plotly_chart(fig_bar_category, use_container_width=True)
736
  # Most Frequent Entities Bar Chart
 
739
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
740
  fig_bar_freq = go.Figure().update_layout(title="No repeating entities for plot")
741
  if not repeating_entities.empty:
742
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
743
+ color='Entity', title='Top 10 Most Frequent Entities',
744
+ color_discrete_sequence=px.colors.sequential.Plasma)
745
  with col3:
746
  st.plotly_chart(fig_bar_freq, use_container_width=True)
747
+
748
+ # 5. Network Graph
749
  st.markdown("---")
750
  st.markdown("### 5. Entity Co-occurrence Network")
751
+ network_fig = generate_network_graph(df, st.session_state.last_text)
752
+ st.plotly_chart(network_fig, use_container_width=True)
753
+
 
 
 
 
 
754
  # 6. Topic Modeling
755
  st.markdown("---")
756
  st.markdown("### 6. Topic Modeling (LDA on Entities)")
757
  if df_topic_data is not None and not df_topic_data.empty:
 
758
  bubble_figure = create_topic_word_bubbles(df_topic_data)
759
+ if bubble_figure:
760
+ st.plotly_chart(bubble_figure, use_container_width=True)
761
+ else:
762
+ st.error("Visualization for Topic Modeling failed.")
763
  else:
764
+ st.info("Topic Modeling requires at least two unique entities and sufficient data to generate meaningful topics.")
765
 
766
+ # Final Report Download
767
+ st.markdown("---")
768
+ st.markdown("### Download Full HTML Report 🚀")
 
 
 
 
 
769
 
770
+ # Generate the full HTML content
771
+ html_report = generate_html_report(
772
+ df=df,
773
+ text_input=st.session_state.last_text,
774
+ elapsed_time=st.session_state.elapsed_time,
775
+ df_topic_data=df_topic_data
776
+ )
777
 
778
+ st.download_button(
779
+ label="Download Analysis Report (.html)",
780
+ data=html_report,
781
+ file_name="entity_analysis_report.html",
782
+ mime="text/html"
783
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
785
 
786