AIEcosystem committed on
Commit
e64256b
·
verified ·
1 Parent(s): f18fcc7

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +57 -832
src/streamlit_app.py CHANGED
@@ -1,841 +1,66 @@
1
- import os
2
- os.environ['HF_HOME'] = '/tmp'
3
- import time
4
  import streamlit as st
5
- import streamlit.components.v1 as components
6
- import pandas as pd
7
- import io
8
  import plotly.express as px
9
- import plotly.graph_objects as go
10
- import numpy as np
11
- import re
12
- import string
13
- import json
14
- # --- PPTX Imports ---
15
  from io import BytesIO
16
  from pptx import Presentation
17
- from pptx.util import Inches, Pt
18
- from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
- import plotly.io as pio # Required for image export (needs kaleido installed)
20
- # ---------------------------
21
- # --- Stable Scikit-learn LDA Imports ---
22
- from sklearn.feature_extraction.text import TfidfVectorizer
23
- from sklearn.decomposition import LatentDirichletAllocation
24
- # ------------------------------
25
- from gliner import GLiNER
26
- from streamlit_extras.stylable_container import stylable_container
27
-
28
# Optional Comet ML integration: fall back to a no-op stub when the package
# is not installed so the app still runs without experiment logging.
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in mirroring the small part of the Comet API used here.

        FIX: the stub methods now accept **kwargs as well as *args; the real
        Comet client is routinely called with keyword arguments (e.g.
        ``log_table(filename, tabular_data=...)``), and the previous
        ``*args``-only signatures would raise TypeError in the fallback path.
        """
        def __init__(self, *args, **kwargs): pass
        def log_parameter(self, *args, **kwargs): pass
        def log_table(self, *args, **kwargs): pass
        def end(self): pass
37
# --- Model Home Directory (Fix for deployment environments) ---
# Set HF_HOME environment variable to a writable path
os.environ['HF_HOME'] = '/tmp'

# --- Color Map for Highlighting and Network Graph Nodes (Monochrome Palette) ---
# Maps every supported entity label to a grayscale hex color; used both by
# highlight_entities (inline spans) and generate_network_graph (node colors).
entity_color_map = {
    "person": "#444444",               # Dark Gray
    "username": "#666666",             # Medium-Dark Gray
    "hashtag": "#888888",              # Medium Gray
    "mention" : "#aaaaaa",             # Medium-Light Gray
    "organization": "#333333",         # Very Dark Gray
    "community": "#bbbbbb",            # Light Gray
    "position": "#555555",             # Slightly Dark Gray
    "location": "#777777",             # Neutral Gray
    "event": "#999999",                # Silver
    "product": "#cccccc",              # Light Gray/Silver
    "platform": "#222222",             # Black-ish
    "date": "#dddddd",                 # Very Light Gray
    "media_type": "#333333",           # Very Dark Gray
    "url": "#666666",                  # Medium-Dark Gray
    "nationality_religion": "#aaaaaa"  # Medium-Light Gray
}

# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
# The NER model predicts exactly these 15 labels; category_mapping groups
# them into the four high-level categories used in the report visuals.
labels = list(entity_color_map.keys())
category_mapping = {
    "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
    "Location & Organization": ["location", "organization"],
    "Temporal & Events": ["event", "date"],
    "Digital & Products": ["platform", "product", "media_type", "url"],
}

# FIX: Corrected the dictionary comprehension to avoid redundant iteration variable (preventing UnboundLocalError)
# Inverse lookup: entity label -> report category.
reverse_category_mapping = {label: category
                            for category, label_list in category_mapping.items() for label in label_list}
73
-
74
-
75
- # --- Utility Functions for Analysis and Plotly ---
76
def extract_label(node_name):
    """Return the parenthesised label at the end of a node string.

    Node names look like ``'Text (Label)'``; when no trailing ``(...)``
    group is present, ``'Unknown'`` is returned.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found:
        return found.group(1)
    return "Unknown"
80
-
81
def remove_trailing_punctuation(text_string):
    """Strip any run of ASCII punctuation characters from the end of a string."""
    # Walk backwards from the end until a non-punctuation character is found.
    end = len(text_string)
    while end > 0 and text_string[end - 1] in string.punctuation:
        end -= 1
    return text_string[:end]
84
-
85
def highlight_entities(text, df_entities):
    """Render *text* as HTML with every detected entity wrapped in a colored span.

    Spans are inserted from the rightmost entity backwards so that earlier
    character offsets stay valid after each insertion. Returns the raw text
    unchanged when no entities were found.
    """
    if df_entities.empty:
        return text

    rendered = text
    # Iterate rows sorted by descending start offset (rightmost first).
    for row in df_entities.sort_values(by='start', ascending=False).itertuples(index=False):
        # Monochrome palette; black fallback for unknown labels.
        color = entity_color_map.get(row.label, '#000000')
        span = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{row.label}">{row.text}</span>'
        rendered = rendered[:row.start] + span + rendered[row.end:]

    # Wrap in a div that mimics the Streamlit input box style (monochrome).
    return f'<div style="border: 1px solid #AAAAAA; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{rendered}</div>'
109
-
110
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Performs basic Topic Modeling using LDA on the extracted entities
    and returns structured data for visualization.

    Each unique entity text is treated as one "document". Returns a
    DataFrame with columns Topic_ID / Word / Weight, or None when fewer
    than two unique entities exist or when fitting fails.
    """
    documents = df_entities['text'].unique().tolist()
    # LDA is meaningless on a single document; caller treats None as "skip".
    if len(documents) < 2:
        return None

    # Cap the number of reported words at the number of documents available.
    N = min(num_top_words, len(documents))
    try:
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,
            stop_words='english'
        )
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

        # random_state fixed so repeated runs on the same text are stable.
        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)
        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # argsort slice picks the N highest-weighted vocabulary indices.
            top_words_indices = topic.argsort()[:-N - 1:-1]
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]
            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })
        return pd.DataFrame(topic_data_list)
    except Exception as e:
        # Surface the failure in the UI but keep the app running.
        st.error(f"Topic modeling failed: {e}")
        return None
148
-
149
def create_topic_word_bubbles(df_topic_data):
    """Generates a Plotly Bubble Chart for top words across all topics.

    Accepts either the raw output of perform_topic_modeling
    (Topic_ID/Word/Weight) or an already-renamed frame (topic/word/weight);
    the rename below is a no-op for missing columns. Returns None for an
    empty frame.
    """
    # Renaming columns to match the output of perform_topic_modeling
    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
    df_topic_data['x_pos'] = df_topic_data.index  # Use index for x-position in the app

    if df_topic_data.empty:
        return None
    fig = px.scatter(
        df_topic_data,
        x='x_pos',
        y='weight',
        size='weight',       # bubble area encodes the word's topic weight
        color='topic',
        hover_name='word',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.sequential.Greys,  # Using grayscale palette
        labels={
            'x_pos': 'Entity/Word Index',
            'weight': 'Word Weight',
            'topic': 'Topic ID'
        },
        custom_data=['word', 'weight', 'topic']
    )
    fig.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Word Weight",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True},
        showlegend=True,
        plot_bgcolor='#f9f9f9',   # Neutral background
        paper_bgcolor='#f9f9f9',  # Neutral background
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )
    # Hover shows word + weight only; <extra></extra> suppresses the trace box.
    fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
                      marker=dict(line=dict(width=1, color='DarkSlateGrey')))
    return fig
188
-
189
def generate_network_graph(df, raw_text):
    """
    Generates a network graph visualization (Node Plot) with edges
    based on entity co-occurrence in sentences.

    Nodes are unique (text, label) entities laid out on a circle with random
    jitter; node size scales with entity frequency. Two nodes are connected
    when both entity strings appear (case-insensitively) in the same sentence
    of *raw_text*.

    NOTE(review): layout uses np.random.normal without a fixed seed, so node
    positions differ between runs — confirm whether determinism matters.
    """
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # One node per unique (text, label) pair, annotated with its frequency.
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
    if unique_entities.shape[0] < 2:
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    # Circular layout with Gaussian jitter to reduce label overlap.
    num_nodes = len(unique_entities)
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)

    radius = 10
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
    edges = set()

    # Naive sentence splitter: break on ./?/! followed by whitespace, with
    # lookbehinds guarding common abbreviation patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
    for sentence in sentences:
        entities_in_sentence = []
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)
        unique_entities_in_sentence = list(set(entities_in_sentence))

        # Every unordered pair of co-occurring entities becomes one edge;
        # sorting the tuple deduplicates (a, b) vs (b, a).
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)

    # Build flattened coordinate lists; None separates line segments in Plotly.
    edge_x = []
    edge_y = []

    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False
    )
    fig.add_trace(edge_trace)

    # Node trace: marker size encodes frequency, color encodes entity label.
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        showlegend=False,
        marker=dict(
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],  # Use monochrome map
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # Invisible one-point traces solely to populate the legend (one per label).
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']  # 'label' is defined here
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True
            ))
    for trace in legend_traces:
        fig.add_trace(trace)

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )

    return fig
298
-
299
-
300
- # --- PPTX HELPER FUNCTIONS ---
301
-
302
def fig_to_image_buffer(fig):
    """
    Converts a Plotly figure object into a BytesIO buffer containing PNG data.
    Requires 'kaleido' to be installed for image export.
    Returns None if export fails.
    """
    try:
        # Static export goes through kaleido; raises when it is missing/broken.
        png_bytes = pio.to_image(fig, format="png", width=900, height=500, scale=2)
    except Exception as e:
        # Changed the error message to be more explicit about the Kaleido dependency issue
        print(f"Plotly image export failed (Kaleido dependency error): {e}. This means the PPTX will contain placeholder slides where charts should be.")
        return None
    return BytesIO(png_bytes)
317
-
318
- # --- PPTX GENERATION FUNCTION ---
319
-
320
def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
    """
    Generates a PowerPoint presentation (.pptx) file containing key analysis results.
    Returns the file content as a BytesIO buffer.

    Slide order: title, source text, entity summary table, treemap, category
    bar chart, topic bubble chart. Chart slides degrade to text placeholders
    when static image export (kaleido) is unavailable.
    """
    prs = Presentation()
    # Layout 5: Title and Content (often good for charts)
    chart_layout = prs.slide_layouts[5]

    # 1. Title Slide (Layout 0)
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    slide.shapes.title.text = "NER & Topic Analysis Report"

    # FIX: Add safety check for placeholder index 1 (subtitle)
    if len(slide.placeholders) > 1:
        subtitle = slide.placeholders[1]
        subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
    # End FIX

    # 2. Source Text Slide
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Analyzed Source Text"

    # Add the raw text to a text box
    left = Inches(0.5)
    top = Inches(1.5)
    width = Inches(9.0)
    height = Inches(5.0)
    txBox = slide.shapes.add_textbox(left, top, width, height)
    tf = txBox.text_frame
    tf.margin_top = Inches(0.1)
    tf.margin_bottom = Inches(0.1)
    tf.word_wrap = True
    p = tf.add_paragraph()
    p.text = text_input
    p.font.size = Pt(14)
    p.font.name = 'Arial'

    # 3. Entity Summary Slide (Table)
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Entity Summary (Count by Category and Label)"

    # Create the summary table using the app's established logic
    grouped_entity_table = df['label'].value_counts().reset_index()
    grouped_entity_table.columns = ['Entity Label', 'Count']
    grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
        lambda x: reverse_category_mapping.get(x, 'Other')
    )
    grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]

    # Simple way to insert a table:
    rows, cols = grouped_entity_table.shape
    x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
    # Add 1 row for the header
    table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table

    # Set column widths
    table.columns[0].width = Inches(2.7)
    table.columns[1].width = Inches(2.8)
    table.columns[2].width = Inches(2.5)

    # Set column headers
    # NOTE(review): only the header row is populated here — no loop writes the
    # DataFrame rows into table.cell(r + 1, c), so the body cells stay empty.
    # Confirm whether the data rows were intentionally dropped.
    for i, col in enumerate(grouped_entity_table.columns):
        cell = table.cell(0, i)
        cell.text = col
        cell.fill.solid()
        # Optional: Add simple styling to header

    # 4. Treemap Slide (Visualization)
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.sequential.Greys  # Monochrome palette
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_image = fig_to_image_buffer(fig_treemap)

    if treemap_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Entity Distribution Treemap"
        slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
    else:
        # Placeholder if image conversion failed (e.g., Kaleido issue)
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Entity Distribution Treemap (Chart Failed)"
        # FIX: Safety check for placeholder index 1
        if len(slide.placeholders) > 1:
            slide.placeholders[1].text = "Chart generation failed, likely due to a missing 'kaleido' dependency for static image export."


    # 5. Entity Count Bar Chart Slide (Visualization)
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_bar_category = px.bar(
        grouped_counts,
        x='Category',
        y='Count',
        color='Category',
        title='Total Entities per Category',
        color_discrete_sequence=px.colors.sequential.Greys  # Monochrome palette
    )
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
    bar_category_image = fig_to_image_buffer(fig_bar_category)

    if bar_category_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Total Entities per Category"
        slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
    else:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Total Entities per Category (Chart Failed)"
        # FIX: Safety check for placeholder index 1
        if len(slide.placeholders) > 1:
            slide.placeholders[1].text = "Chart generation failed, likely due to a missing 'kaleido' dependency for static image export."


    # 6. Topic Modeling Bubble Chart Slide
    if df_topic_data is not None and not df_topic_data.empty:
        # Ensure data frame is in the format expected by create_topic_word_bubbles
        df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
        bubble_figure = create_topic_word_bubbles(df_topic_data_pptx)
        bubble_image = fig_to_image_buffer(bubble_figure)
        if bubble_image:
            slide = prs.slides.add_slide(chart_layout)
            slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
            slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
        else:
            slide = prs.slides.add_slide(chart_layout)
            slide.shapes.title.text = "Topic Word Weights (Chart Failed)"
            # FIX: Safety check for placeholder index 1
            if len(slide.placeholders) > 1:
                slide.placeholders[1].text = "Chart generation failed, likely due to a missing 'kaleido' dependency for static image export."

    else:
        # Placeholder slide if topic modeling is not available
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Topic Modeling Results"
        # FIX: Safety check for placeholder index 1
        if len(slide.placeholders) > 1:
            slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."

    # Save the presentation to an in-memory buffer
    pptx_buffer = BytesIO()
    prs.save(pptx_buffer)
    pptx_buffer.seek(0)
    return pptx_buffer
470
-
471
- # --- NEW CSV GENERATION FUNCTION ---
472
def generate_entity_csv(df):
    """
    Serialize the extracted entities to CSV in an in-memory buffer.

    Only the text, label, category, score, start, and end columns are
    exported, in that order; the buffer is rewound before being returned.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    encoded = df[export_columns].to_csv(index=False).encode('utf-8')
    csv_buffer = BytesIO(encoded)
    csv_buffer.seek(0)
    return csv_buffer
483
- # -----------------------------------
484
-
485
- # --- Existing App Functionality (HTML) ---
486
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """
    Generates a full HTML report containing all analysis results and visualizations.

    Returns the report as a single self-contained HTML string; Plotly figures
    are embedded as fragments that load plotly.js from the CDN.
    """
    # 1. Generate Visualizations (Plotly HTML)

    # 1a. Treemap
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.sequential.Greys  # Monochrome palette
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie Chart
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Greys)  # Monochrome palette
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar Chart (Category Count)
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.sequential.Greys)  # Monochrome palette
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
    bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')

    # 1d. Bar Chart (Most Frequent Entities)
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    # Fallback message used when no entity occurs more than once.
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'

    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Greys)  # Monochrome palette
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network Graph HTML
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic Charts HTML
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #AAAAAA;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Get Highlighted Text
    # The div style is now monochrome/neutral (border: #AAAAAA, background: #FFFFFF)
    # The replace() retags the wrapper div so the report CSS class applies.
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")

    # 3. Entity Tables (Pandas to HTML)
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the Final HTML
    # Updated CSS to remove all color/pink references
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
    body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f4; color: #333; }}
    .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
    h1 {{ color: #333333; border-bottom: 3px solid #666666; padding-bottom: 10px; margin-top: 0; }}
    h2 {{ color: #555555; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
    h3 {{ color: #555; margin-top: 20px; }}
    .metadata {{ background-color: #eeeeee; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
    .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
    table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
    table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
    table th {{ background-color: #f0f0f0; }}
    .highlighted-text {{ border: 1px solid #AAAAAA; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
    </style></head><body>
    <div class="container">
    <h1>Entity and Topic Analysis Report</h1>
    <div class="metadata">
    <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
    <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
    </div>
    <h2>1. Analyzed Text & Extracted Entities</h2>
    <h3>Original Text with Highlighted Entities</h3>
    <div class="highlighted-text-container">
    {highlighted_text_html}
    </div>
    <h2>2. Full Extracted Entities Table</h2>
    {entity_table_html}
    <h2>3. Data Visualizations</h2>
    <h3>3.1 Entity Distribution Treemap</h3>
    <div class="chart-box">{treemap_html}</div>
    <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
    <div class="chart-box">{pie_html}</div>
    <div class="chart-box">{bar_category_html}</div>
    <div class="chart-box">{bar_freq_html}</div>
    <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
    <div class="chart-box">{network_html}</div>
    <h2>4. Topic Modeling (LDA on Entities)</h2>
    {topic_charts_html}
    </div></body></html>
    """
    return html_content
602
-
603
-
604
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
# Global monochrome CSS overrides for the Streamlit widgets used below.
st.markdown(
    """
    <style>
    /* Overall app container - NO SIDEBAR */
    .main {
        background-color: #F8F8F8; /* Near White/Lightest Gray */
        color: #333333; /* Dark grey text for contrast */
    }
    .stApp {
        background-color: #F8F8F8;
    }
    /* Text Area background and text color (input fields) */
    .stTextArea textarea {
        background-color: #FFFFFF; /* Pure White for input fields */
        color: #000000; /* Black text for input */
        border: 1px solid #AAAAAA; /* Gray border */
    }
    /* Button styling */
    .stButton > button {
        background-color: #666666; /* Medium Gray for the button */
        color: #FFFFFF; /* White text for contrast */
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
        transition: background-color 0.3s;
    }
    .stButton > button:hover {
        background-color: #444444; /* Darker Gray on hover */
    }
    /* Expander header and content background */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #EEEEEE; /* Very Light Gray */
        color: #333333;
    }
    </style>
    """,
    unsafe_allow_html=True)
st.subheader("NER and Topic Analysis Report Generator", divider="gray")  # Divider is now gray
st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
expander = st.expander("**Important notes**")
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and **`kaleido`**. If charts in the PPTX are blank, please check your environment's $\text{kaleido}$ installation/permissions.
**Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")

# --- Comet ML Setup (Placeholder/Conditional) ---
# All three credentials must be present for experiment logging to activate.
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
657
-
658
# --- Model Loading ---
# Cached across reruns so the (large) GLiNER checkpoint is downloaded and
# initialized only once per server process.
@st.cache_resource
def load_ner_model():
    """Loads the GLiNER model and caches it.

    On failure (no network, missing checkpoint) an error is shown and the
    Streamlit script is halted via st.stop().
    """
    try:
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()

model = load_ner_model()
669
-
670
# --- LONG DEFAULT TEXT (178 Words) ---
# Demo input pre-filled into the text area; deliberately rich in people,
# organizations, dates, locations, and products so every label can fire.
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
)
# -----------------------------------
# --- Session State Initialization (CRITICAL FIX) ---
# Seed every key on first run so reruns can read them unconditionally.
if 'show_results' not in st.session_state:
    st.session_state.show_results = False
if 'last_text' not in st.session_state:
    st.session_state.last_text = ""
if 'results_df' not in st.session_state:
    st.session_state.results_df = pd.DataFrame()
if 'elapsed_time' not in st.session_state:
    st.session_state.elapsed_time = 0.0
if 'topic_results' not in st.session_state:
    st.session_state.topic_results = None
if 'my_text_area' not in st.session_state:
    st.session_state.my_text_area = DEFAULT_TEXT
700
-
701
# --- Clear Button Function (MODIFIED) ---
def clear_text():
    """Clears the text area (sets it to an empty string) and hides results."""
    # Reset every result-related session key in one pass; the text area key
    # is included so the widget empties on the next rerun.
    _reset_values = {
        'my_text_area': "",
        'show_results': False,
        'last_text': "",
        'results_df': pd.DataFrame(),
        'elapsed_time': 0.0,
        'topic_results': None,
    }
    for _key, _value in _reset_values.items():
        st.session_state[_key] = _value
710
-
711
# --- Text Input and Clear Button ---
# Maximum accepted input size; enforced below before running the model.
word_limit = 1000
# The widget is bound to st.session_state['my_text_area'] via `key`, so
# clear_text() can reset it from the button callback.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
    value=st.session_state.my_text_area)

# Whitespace-split word count shown live under the text area.
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
# on_click runs before the rerun, so the cleared state is visible immediately.
st.button("Clear text", on_click=clear_text)
722
-
723
# --- Results Trigger and Processing (Updated Logic) ---
if st.button("Results"):
    if not text.strip():
        # Empty/whitespace-only input: warn and hide any stale results.
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        # Over the limit: refuse to run the model on oversized input.
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        with st.spinner("Extracting entities and generating report data...", show_time=True):
            # Only re-run the (expensive) model when the text actually changed;
            # otherwise the cached session-state results are reused below.
            if text != st.session_state.last_text:
                st.session_state.last_text = text
                start_time = time.time()

                # --- Model Prediction & Dataframe Creation ---
                # `model` and `labels` are defined earlier in the file (GLiNER).
                entities = model.predict_entities(text, labels)
                df = pd.DataFrame(entities)

                if not df.empty:
                    # Normalize entity surface forms and map labels to categories.
                    df['text'] = df['text'].apply(remove_trailing_punctuation)
                    df['category'] = df['label'].map(reverse_category_mapping)
                    st.session_state.results_df = df

                    # Cap topic-model vocabulary at 10 (or fewer if the text
                    # produced fewer unique entities).
                    unique_entity_count = len(df['text'].unique())
                    N_TOP_WORDS_TO_USE = min(10, unique_entity_count)

                    st.session_state.topic_results = perform_topic_modeling(
                        df,
                        num_topics=2,
                        num_top_words=N_TOP_WORDS_TO_USE
                    )

                    # Best-effort experiment logging; skipped when Comet is
                    # not configured (comet_initialized is set earlier).
                    if comet_initialized:
                        experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                        experiment.log_parameter("input_text", text)
                        experiment.log_table("predicted_entities", df)
                        experiment.end()
                else:
                    # No entities found: clear cached results so the display
                    # section shows the "no entities" error path.
                    st.session_state.results_df = pd.DataFrame()
                    st.session_state.topic_results = None

                end_time = time.time()
                st.session_state.elapsed_time = end_time - start_time

            # Shown on both fresh and cached runs; elapsed_time reflects the
            # last actual model run.
            st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
            st.session_state.show_results = True
769
-
770
# --- Display Download Link and Results (The missing logic that was completed) ---
# Rendered on every rerun while show_results is set, so downloads survive
# widget interactions without re-running the model.
if st.session_state.show_results:
    df = st.session_state.results_df

    if df.empty:
        st.error("No entities were extracted from the text. The report cannot be generated.")
    else:
        # --- Generate All Report Files/Buffers ---
        # The three generators are defined earlier in the file.
        with st.spinner("Generating Report Files (HTML, PPTX, CSV)..."):
            # 1. HTML Report Generation
            html_report_content = generate_html_report(
                df,
                st.session_state.last_text,
                st.session_state.elapsed_time,
                st.session_state.topic_results
            )

            # 2. PPTX Report Generation
            pptx_buffer = generate_pptx_report(
                df,
                st.session_state.last_text,
                st.session_state.elapsed_time,
                st.session_state.topic_results,
                reverse_category_mapping
            )

            # 3. CSV Report Generation
            csv_buffer = generate_entity_csv(df)

        # --- Display Downloads and Preview ---
        st.markdown("## Download Analysis Reports", anchor=False)
        st.markdown("---")

        # One download button per report format, side by side.
        col1, col2, col3 = st.columns(3)

        with col1:
            st.download_button(
                label="Download HTML Report 🌐",
                data=html_report_content,
                file_name="entity_topic_report.html",
                mime="text/html",
                help="A full, interactive report with all charts."
            )
        with col2:
            st.download_button(
                label="Download PowerPoint (.pptx) 📊",
                data=pptx_buffer,
                file_name="entity_topic_slides.pptx",
                mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
                help="A summary presentation with static charts."
            )
        with col3:
            st.download_button(
                label="Download Raw Entities (.csv) 📋",
                data=csv_buffer,
                file_name="extracted_entities.csv",
                mime="text/csv",
                help="Raw data table of all extracted entities."
            )

        st.markdown("---")

        # --- Display Interactive Preview ---
        st.markdown("## Interactive HTML Report Preview", anchor=False)
        st.info("Scroll within the box below to see the complete report and interactive charts.")

        # Display the HTML report using the Streamlit component
        # (components.html sandboxes the report in an iframe).
        components.html(
            html_report_content,
            height=800,
            scrolling=True
        )
 
 
 
 
1
  import streamlit as st
 
 
 
2
  import plotly.express as px
3
+ import pandas as pd
 
 
 
 
 
4
  from io import BytesIO
5
  from pptx import Presentation
6
+ from pptx.util import Inches
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
# Demo dataset and the Plotly bar chart built from it.
df = pd.DataFrame({'Category': ['A', 'B', 'C'], 'Value': [10, 20, 30]})
fig = px.bar(df, x='Category', y='Value', title='Sample Plotly Bar Chart')

# Render the figure once, up front, to PNG bytes for embedding in the PPTX
# (static export requires the kaleido package).
_png_buffer = BytesIO()
fig.write_image(_png_buffer, format='png', width=800, height=400)
_png_buffer.seek(0)
img_data = _png_buffer.getvalue()
 
 
17
 
18
# Function to create PPTX
def create_presentation():
    """Build a two-slide presentation and return it as raw ``bytes``.

    Slide 1 is a title slide; slide 2 embeds the pre-rendered Plotly PNG
    (module-level ``img_data``) and a table mirroring the module-level
    ``df``. The returned bytes are ready for ``st.download_button``.
    """
    prs = Presentation()

    # Title slide (layout 0 = title layout in the default template).
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    title = slide.shapes.title
    title.text = "Streamlit Plotly Export"

    # Slide with Plotly image and table (layout 1 = title + content).
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    title = slide.shapes.title
    title.text = "Plotly Chart and Data"

    # Add Plotly image; height is derived from the image's aspect ratio.
    left = Inches(1)
    top = Inches(1.5)
    slide.shapes.add_picture(BytesIO(img_data), left, top, width=Inches(6))

    # Add table below the chart. Headers and cells are taken from the
    # DataFrame itself (instead of hardcoding 'Category'/'Value'), so the
    # slide stays correct if df's columns ever change, and str() coercion
    # handles non-string cell values.
    rows, cols = df.shape
    left = Inches(1)
    top = Inches(4)
    width = Inches(6)
    height = Inches(0.8)
    table = slide.shapes.add_table(rows + 1, cols, left, top, width, height).table
    for j, col_name in enumerate(df.columns):
        table.cell(0, j).text = str(col_name)
    for i in range(rows):
        for j in range(cols):
            table.cell(i + 1, j).text = str(df.iloc[i, j])

    # Save to an in-memory buffer and hand back the raw bytes.
    bio = BytesIO()
    prs.save(bio)
    bio.seek(0)
    return bio.getvalue()
55
+
56
# --- Streamlit UI ---
st.title("Export Plotly Graph to PPTX")
# Show the interactive chart inline in the app.
st.plotly_chart(fig)
# Build the deck on demand, then offer it for download.
if st.button("Generate and Download Slides"):
    deck_bytes = create_presentation()
    st.download_button(
        label="Download PPTX",
        data=deck_bytes,
        file_name="plotly_slides.pptx",
        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )