AIEcosystem commited on
Commit
adf3d87
·
verified ·
1 Parent(s): f91c3e9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +307 -139
src/streamlit_app.py CHANGED
@@ -10,6 +10,14 @@ import plotly.graph_objects as go
10
  import numpy as np
11
  import re
12
  import string
 
 
 
 
 
 
 
 
13
  # --- Stable Scikit-learn LDA Imports ---
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
  from sklearn.decomposition import LatentDirichletAllocation
@@ -50,7 +58,18 @@ entity_color_map = {
50
  "nationality_religion": "#fb7185"
51
  }
52
 
53
- # --- Utility Functions ---
 
 
 
 
 
 
 
 
 
 
 
54
  def extract_label(node_name):
55
  """Extracts the label from a node string like 'Text (Label)'."""
56
  match = re.search(r'\(([^)]+)\)$', node_name)
@@ -88,22 +107,17 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
88
  """
89
  Performs basic Topic Modeling using LDA on the extracted entities
90
  and returns structured data for visualization.
91
-
92
- Includes updated TF-IDF parameters (stop_words='english', max_df=0.95, min_df=1).
93
  """
94
- # Aggregate all unique entity text into a single document list
95
  documents = df_entities['text'].unique().tolist()
96
  if len(documents) < 2:
97
  return None
98
 
99
  N = min(num_top_words, len(documents))
100
  try:
101
- # UPDATED: Added stop_words='english' to filter common words tokenized
102
- # from multi-word entities (e.g., "The" from "The White House").
103
  tfidf_vectorizer = TfidfVectorizer(
104
  max_df=0.95,
105
- min_df=1, # Retained at 1 to keep all unique entities
106
- stop_words='english' # <-- THIS IS THE KEY ADDITION
107
  )
108
  tfidf = tfidf_vectorizer.fit_transform(documents)
109
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
@@ -130,113 +144,102 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
130
 
131
  def create_topic_word_bubbles(df_topic_data):
132
  """Generates a Plotly Bubble Chart for top words across all topics."""
 
 
 
133
 
134
  if df_topic_data.empty:
135
  return None
136
  fig = px.scatter(
137
  df_topic_data,
138
- x='Word',
139
- y='Topic_ID',
140
- size='Weight',
141
- color='Topic_ID',
 
142
  size_max=80,
143
  title='Topic Word Weights (Bubble Chart)',
144
  color_discrete_sequence=px.colors.qualitative.Bold,
145
- hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False}
 
 
 
 
 
146
  )
147
  fig.update_layout(
148
  xaxis_title="Entity/Word (Bubble size = Word Weight)",
149
- yaxis_title="Topic ID",
150
  xaxis={'tickangle': -45, 'showgrid': False},
151
- yaxis={'showgrid': True, 'autorange': 'reversed'},
152
  showlegend=True,
153
  plot_bgcolor='#FFF0F5',
154
  paper_bgcolor='#FFF0F5',
155
  height=600,
156
  margin=dict(t=50, b=100, l=50, r=10),
157
  )
158
-
159
- fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
160
-
161
  return fig
162
 
163
  def generate_network_graph(df, raw_text):
164
  """
165
  Generates a network graph visualization (Node Plot) with edges
166
- based on entity co-occurrence in sentences.
167
  """
 
168
  entity_counts = df['text'].value_counts().reset_index()
169
  entity_counts.columns = ['text', 'frequency']
170
 
171
- # Merge counts with unique entities (text + label)
172
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
173
  if unique_entities.shape[0] < 2:
174
- # Return a simple figure with a message if not enough data
175
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
176
 
177
  num_nodes = len(unique_entities)
178
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
179
 
180
  radius = 10
181
- # Assign circular positions + a little randomness
182
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
183
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
184
 
185
- # Map entity text to its coordinates for easy lookup
186
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
187
- # ----------------------------------------------------------------------
188
- # 1. Identify Edges (Co-occurrence in sentences)
189
- # ----------------------------------------------------------------------
190
  edges = set()
191
 
192
- # Simple sentence segmentation (handles standard punctuation followed by space)
193
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
194
  for sentence in sentences:
195
- # Find unique entities that are substrings of this sentence
196
  entities_in_sentence = []
197
  for entity_text in unique_entities['text'].unique():
198
  if entity_text.lower() in sentence.lower():
199
  entities_in_sentence.append(entity_text)
200
- # Create edges (pairs) based on co-occurrence
201
  unique_entities_in_sentence = list(set(entities_in_sentence))
202
 
203
- # Create all unique pairs (edges)
204
  for i in range(len(unique_entities_in_sentence)):
205
  for j in range(i + 1, len(unique_entities_in_sentence)):
206
  node1 = unique_entities_in_sentence[i]
207
  node2 = unique_entities_in_sentence[j]
208
-
209
- # Ensure consistent order for the set to avoid duplicates like (A, B) and (B, A)
210
  edge_tuple = tuple(sorted((node1, node2)))
211
  edges.add(edge_tuple)
212
- # ----------------------------------------------------------------------
213
- # 2. Create Plotly Trace Data for Edges
214
- # ----------------------------------------------------------------------
215
  edge_x = []
216
  edge_y = []
217
 
218
  for edge in edges:
219
  n1, n2 = edge
220
  if n1 in pos_map and n2 in pos_map:
221
- # Append coordinates for line segment: [x1, x2, None] for separation
222
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
223
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
224
 
225
  fig = go.Figure()
226
 
227
- # Add Edge Trace (Lines)
228
  edge_trace = go.Scatter(
229
  x=edge_x, y=edge_y,
230
  line=dict(width=0.5, color='#888'),
231
  hoverinfo='none',
232
  mode='lines',
233
  name='Co-occurrence Edges',
234
- showlegend=False # Edges don't need a legend entry
235
  )
236
  fig.add_trace(edge_trace)
237
- # ----------------------------------------------------------------------
238
- # 3. Add Node Trace (Markers)
239
- # ----------------------------------------------------------------------
240
  fig.add_trace(go.Scatter(
241
  x=unique_entities['x'],
242
  y=unique_entities['y'],
@@ -244,8 +247,6 @@ def generate_network_graph(df, raw_text):
244
  name='Entities',
245
  text=unique_entities['text'],
246
  textposition="top center",
247
- # FIX: Explicitly set showlegend=False for the main node trace
248
- # as we are creating separate traces for the legend colors below.
249
  showlegend=False,
250
  marker=dict(
251
  size=unique_entities['frequency'] * 5 + 10,
@@ -264,7 +265,6 @@ def generate_network_graph(df, raw_text):
264
  )
265
  ))
266
 
267
- # Adding discrete traces for the legend based on unique labels
268
  legend_traces = []
269
  seen_labels = set()
270
  for index, row in unique_entities.iterrows():
@@ -273,12 +273,7 @@ def generate_network_graph(df, raw_text):
273
  seen_labels.add(label)
274
  color = entity_color_map.get(label, '#cccccc')
275
  legend_traces.append(go.Scatter(
276
- x=[None],
277
- y=[None],
278
- mode='markers',
279
- marker=dict(size=10, color=color),
280
- name=f"{label.capitalize()}",
281
- showlegend=True # Ensure legend traces are explicitly visible
282
  ))
283
  for trace in legend_traces:
284
  fig.add_trace(trace)
@@ -287,7 +282,6 @@ def generate_network_graph(df, raw_text):
287
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
288
  showlegend=True,
289
  hovermode='closest',
290
- # Set explicit range to ensure padding for text labels on the edge
291
  xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
292
  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
293
  plot_bgcolor='#f9f9f9',
@@ -298,27 +292,175 @@ def generate_network_graph(df, raw_text):
298
 
299
  return fig
300
 
301
- def generate_html_report(df, text_input, elapsed_time, df_topic_data):
 
 
 
302
  """
303
- Generates a full HTML report containing all analysis results and visualizations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
- FIX APPLIED: Removed the CSS Grid layout for the three comparative charts
306
- (Pie, Category Count, Frequency) and stacked them vertically to prevent
307
- overlapping and ensure reliable rendering across devices.
308
- FIX 2 APPLIED: Increased the bottom margin (b) for both bar charts to prevent X-axis labels from being cut off.
309
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
 
 
 
 
 
 
 
311
  # 1. Generate Visualizations (Plotly HTML)
312
 
313
  # 1a. Treemap
314
- # FIX 1: Explicitly set a color_discrete_sequence to prevent the Treemap from being black
315
  fig_treemap = px.treemap(
316
  df,
317
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
318
  values='score',
319
  color='category',
320
  title="Entity Distribution by Category and Label",
321
- color_discrete_sequence=px.colors.qualitative.Dark24 # Use a robust color sequence
322
  )
323
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
324
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
@@ -332,28 +474,25 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
332
 
333
  # 1c. Bar Chart (Category Count)
334
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
335
- # FIX 2: Increased bottom margin from b=10 to b=100
336
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
337
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
338
 
339
  # 1d. Bar Chart (Most Frequent Entities)
340
  word_counts = df['text'].value_counts().reset_index()
341
  word_counts.columns = ['Entity', 'Count']
342
- # Top 10 repeating entities
343
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
344
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
345
 
346
  if not repeating_entities.empty:
347
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
348
- # FIX 2: Increased bottom margin from b=10 to b=100
349
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
350
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
351
 
352
- # 1e. Network Graph HTML - UPDATED to pass text_input
353
  network_fig = generate_network_graph(df, text_input)
354
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
355
 
356
- # 1f. Topic Charts HTML (Now a single Bubble Chart with Placeholder logic)
357
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
358
  if df_topic_data is not None and not df_topic_data.empty:
359
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -362,7 +501,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
362
  else:
363
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
364
  else:
365
- # Placeholder for low data
366
  topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
367
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
368
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
@@ -390,32 +528,11 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
390
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
391
  h3 {{ color: #555; margin-top: 20px; }}
392
  .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
393
- /* The 'grid' class is kept for potential future use or the network graph, but not used for 3.2 */
394
- .grid {{
395
- display: grid;
396
- grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
397
- gap: 20px;
398
- margin-top: 20px;
399
- }}
400
- .chart-box {{
401
- background-color: #f9f9f9;
402
- padding: 15px;
403
- border-radius: 8px;
404
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
405
- /* Important: Set a minimum width for the chart box, and margin for stacking */
406
- min-width: 0;
407
- margin-bottom: 20px; /* NEW: Added margin for separation when stacked */
408
- }}
409
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
410
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
411
  table th {{ background-color: #f0f0f0; }}
412
- /* Specific styling for highlighted text element */
413
  .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
414
- @media (max-width: 1050px) {{ /* Increased breakpoint to help prevent overlap */
415
- .grid {{
416
- grid-template-columns: 1fr; /* Stack charts vertically on smaller screens */
417
- }}
418
- }}
419
  </style></head><body>
420
  <div class="container">
421
  <h1>Entity and Topic Analysis Report</h1>
@@ -438,7 +555,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
438
  <div class="chart-box">{treemap_html}</div>
439
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
440
 
441
- <!-- FIX: Charts are now in separate chart-box divs (not a 'grid') for guaranteed vertical stacking -->
442
  <div class="chart-box">{pie_html}</div>
443
  <div class="chart-box">{bar_category_html}</div>
444
  <div class="chart-box">{bar_freq_html}</div>
@@ -453,6 +569,66 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
453
  """
454
  return html_content
455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  # --- Page Configuration and Styling (No Sidebar) ---
457
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
458
  st.markdown(
@@ -492,7 +668,8 @@ st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
492
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
493
  expander = st.expander("**Important notes**")
494
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
495
- **Results:** Results are compiled into a single, comprehensive **HTML report** for easy download and sharing.
 
496
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
497
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
498
 
@@ -502,22 +679,11 @@ COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
502
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
503
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
504
 
505
- # --- Label Definitions and Category Mapping ---
506
- labels = list(entity_color_map.keys())
507
- category_mapping = {
508
- "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
509
- "Location & Organization": ["location", "organization"],
510
- "Temporal & Events": ["event", "date"],
511
- "Digital & Products": ["platform", "product", "media_type", "url"],
512
- }
513
- reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
514
-
515
  # --- Model Loading ---
516
- @st.cache_resource
517
  def load_ner_model():
518
  """Loads the GLiNER model and caches it."""
519
  try:
520
- # Use nested_ner=True and num_gen_sequences=2 for potentially higher recall
521
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
522
  except Exception as e:
523
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
@@ -553,14 +719,12 @@ if 'elapsed_time' not in st.session_state:
553
  st.session_state.elapsed_time = 0.0
554
  if 'topic_results' not in st.session_state:
555
  st.session_state.topic_results = None
556
- # FIX: Initialize the text area key with default text before st.text_area is called
557
  if 'my_text_area' not in st.session_state:
558
  st.session_state.my_text_area = DEFAULT_TEXT
559
 
560
  # --- Clear Button Function (MODIFIED) ---
561
  def clear_text():
562
  """Clears the text area (sets it to an empty string) and hides results."""
563
- # MODIFIED: Set to empty string for true clearing
564
  st.session_state['my_text_area'] = ""
565
  st.session_state.show_results = False
566
  st.session_state.last_text = ""
@@ -570,7 +734,6 @@ def clear_text():
570
 
571
  # --- Text Input and Clear Button ---
572
  word_limit = 1000
573
- # The text area now safely uses the pre-initialized session state value
574
  text = st.text_area(
575
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
576
  height=250,
@@ -628,7 +791,7 @@ if st.button("Results"):
628
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
629
  st.session_state.show_results = True
630
 
631
- # --- Display Download Link and Results (FIXED INDENTATION AND NEW LAYOUT) ---
632
  if st.session_state.show_results:
633
  df = st.session_state.results_df
634
  df_topic_data = st.session_state.topic_results
@@ -642,7 +805,7 @@ if st.session_state.show_results:
642
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
643
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
644
 
645
- # 2. Entity Summary Table (Count by Label - kept outside tabs)
646
  st.markdown("### 2. Entity Summary Table (Count by Label)")
647
  grouped_entity_table = df['label'].value_counts().reset_index()
648
  grouped_entity_table.columns = ['Entity Label', 'Count']
@@ -650,80 +813,63 @@ if st.session_state.show_results:
650
  st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
651
  st.markdown("---")
652
 
 
653
  st.markdown("### 3. Detailed Entity Analysis")
654
- # 3. New Tabs: Tab 1: Category Details Table | Tab 2: Treemap
655
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
656
 
657
- # TAB 1: Detailed Entities Table Grouped by Category
658
  with tab_category_details:
659
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
660
- # Get the unique categories for creating inner tabs
661
  unique_categories = list(category_mapping.keys())
662
-
663
- # Create inner tabs dynamically based on the available categories
664
  tabs_category = st.tabs(unique_categories)
665
- # We iterate over the categories and tabs simultaneously
666
  for category, tab in zip(unique_categories, tabs_category):
667
- # Filter the main DataFrame for the current category
668
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
669
-
670
  with tab:
671
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
672
  if not df_category.empty:
673
- # Display the DataFrame for the current category
674
  st.dataframe(
675
  df_category,
676
  use_container_width=True,
677
- # Format the score for better readability
678
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
679
  )
680
  else:
681
  st.info(f"No entities of category **{category}** were found in the text.")
682
- # TAB 2: Treemap
683
  with tab_treemap_viz:
684
  st.markdown("#### Treemap: Entity Distribution")
685
- # Treemap
686
- # FIX 1 (Streamlit): Added a robust color sequence here too for consistency in the Streamlit plot
687
  fig_treemap = px.treemap(
688
  df,
689
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
690
  values='score',
691
  color='category',
692
  title="Entity Distribution by Category and Label",
693
- color_discrete_sequence=px.colors.qualitative.Dark24 # Applied fix here
694
  )
695
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
696
  st.plotly_chart(fig_treemap, use_container_width=True)
697
 
698
- # 4. Comparative Charts (Keep outside the new tabs, as in original code structure)
699
  st.markdown("---")
700
  st.markdown("### 4. Comparative Charts")
701
 
702
- # FIX: The three comparative charts are generated here and will be stacked vertically
703
- # in the HTML report output.
704
- col1, col2, col3 = st.columns(3) # Use Streamlit columns for the *Streamlit* preview
705
 
706
  grouped_counts = df['category'].value_counts().reset_index()
707
  grouped_counts.columns = ['Category', 'Count']
708
 
709
- # Pie Chart
710
- with col1:
711
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
712
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
713
  st.plotly_chart(fig_pie, use_container_width=True)
714
 
715
- # Bar Chart (Category Count)
716
- with col2:
717
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
718
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
719
  st.plotly_chart(fig_bar_category, use_container_width=True)
720
 
721
- # Bar Chart (Most Frequent Entities)
722
- word_counts = df['text'].value_counts().reset_index()
723
- word_counts.columns = ['Entity', 'Count']
724
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
725
-
726
- with col3:
727
  if not repeating_entities.empty:
728
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
729
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
@@ -733,15 +879,12 @@ if st.session_state.show_results:
733
 
734
  st.markdown("---")
735
  st.markdown("### 5. Entity Co-occurrence Network")
736
-
737
- # 5. Network Graph
738
  network_fig = generate_network_graph(df, st.session_state.last_text)
739
  st.plotly_chart(network_fig, use_container_width=True)
740
 
741
  st.markdown("---")
742
  st.markdown("### 6. Topic Modeling Analysis")
743
 
744
- # 6. Topic Modeling Bubble Chart
745
  if df_topic_data is not None and not df_topic_data.empty:
746
  bubble_figure = create_topic_word_bubbles(df_topic_data)
747
  if bubble_figure:
@@ -753,14 +896,39 @@ if st.session_state.show_results:
753
 
754
  # --- Report Download ---
755
  st.markdown("---")
756
- st.markdown("### Download Full HTML Report")
757
 
 
758
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
759
  st.download_button(
760
- label="Download HTML Report",
761
  data=html_report,
762
  file_name="ner_topic_report.html",
763
  mime="text/html",
764
  type="primary"
765
  )
766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import numpy as np
11
  import re
12
  import string
13
+ import json
14
+ # --- PPTX Imports (NEW) ---
15
+ from io import BytesIO
16
+ from pptx import Presentation
17
+ from pptx.util import Inches, Pt
18
+ from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
+ import plotly.io as pio # Required for image export
20
+ # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
23
  from sklearn.decomposition import LatentDirichletAllocation
 
58
  "nationality_religion": "#fb7185"
59
  }
60
 
61
+ # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
62
+ labels = list(entity_color_map.keys())
63
+ category_mapping = {
64
+ "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
65
+ "Location & Organization": ["location", "organization"],
66
+ "Temporal & Events": ["event", "date"],
67
+ "Digital & Products": ["platform", "product", "media_type", "url"],
68
+ }
69
+ reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
70
+
71
+
72
+ # --- Utility Functions for Analysis and Plotly ---
73
  def extract_label(node_name):
74
  """Extracts the label from a node string like 'Text (Label)'."""
75
  match = re.search(r'\(([^)]+)\)$', node_name)
 
107
  """
108
  Performs basic Topic Modeling using LDA on the extracted entities
109
  and returns structured data for visualization.
 
 
110
  """
 
111
  documents = df_entities['text'].unique().tolist()
112
  if len(documents) < 2:
113
  return None
114
 
115
  N = min(num_top_words, len(documents))
116
  try:
 
 
117
  tfidf_vectorizer = TfidfVectorizer(
118
  max_df=0.95,
119
+ min_df=1,
120
+ stop_words='english'
121
  )
122
  tfidf = tfidf_vectorizer.fit_transform(documents)
123
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
144
 
145
  def create_topic_word_bubbles(df_topic_data):
146
  """Generates a Plotly Bubble Chart for top words across all topics."""
147
+ # Renaming columns to match the output of perform_topic_modeling
148
+ df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
149
+ df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
150
 
151
  if df_topic_data.empty:
152
  return None
153
  fig = px.scatter(
154
  df_topic_data,
155
+ x='x_pos',
156
+ y='weight',
157
+ size='weight',
158
+ color='topic',
159
+ hover_name='word',
160
  size_max=80,
161
  title='Topic Word Weights (Bubble Chart)',
162
  color_discrete_sequence=px.colors.qualitative.Bold,
163
+ labels={
164
+ 'x_pos': 'Entity/Word Index',
165
+ 'weight': 'Word Weight',
166
+ 'topic': 'Topic ID'
167
+ },
168
+ custom_data=['word', 'weight', 'topic']
169
  )
170
  fig.update_layout(
171
  xaxis_title="Entity/Word (Bubble size = Word Weight)",
172
+ yaxis_title="Word Weight",
173
  xaxis={'tickangle': -45, 'showgrid': False},
174
+ yaxis={'showgrid': True},
175
  showlegend=True,
176
  plot_bgcolor='#FFF0F5',
177
  paper_bgcolor='#FFF0F5',
178
  height=600,
179
  margin=dict(t=50, b=100, l=50, r=10),
180
  )
181
+ fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
 
 
182
  return fig
183
 
184
  def generate_network_graph(df, raw_text):
185
  """
186
  Generates a network graph visualization (Node Plot) with edges
187
+ based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
188
  """
189
+ # Using the existing generate_network_graph logic from previous context...
190
  entity_counts = df['text'].value_counts().reset_index()
191
  entity_counts.columns = ['text', 'frequency']
192
 
 
193
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
194
  if unique_entities.shape[0] < 2:
 
195
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
196
 
197
  num_nodes = len(unique_entities)
198
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
199
 
200
  radius = 10
 
201
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
202
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
203
 
 
204
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
 
 
205
  edges = set()
206
 
 
207
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
208
  for sentence in sentences:
 
209
  entities_in_sentence = []
210
  for entity_text in unique_entities['text'].unique():
211
  if entity_text.lower() in sentence.lower():
212
  entities_in_sentence.append(entity_text)
 
213
  unique_entities_in_sentence = list(set(entities_in_sentence))
214
 
 
215
  for i in range(len(unique_entities_in_sentence)):
216
  for j in range(i + 1, len(unique_entities_in_sentence)):
217
  node1 = unique_entities_in_sentence[i]
218
  node2 = unique_entities_in_sentence[j]
 
 
219
  edge_tuple = tuple(sorted((node1, node2)))
220
  edges.add(edge_tuple)
221
+
 
 
222
  edge_x = []
223
  edge_y = []
224
 
225
  for edge in edges:
226
  n1, n2 = edge
227
  if n1 in pos_map and n2 in pos_map:
 
228
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
229
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
230
 
231
  fig = go.Figure()
232
 
 
233
  edge_trace = go.Scatter(
234
  x=edge_x, y=edge_y,
235
  line=dict(width=0.5, color='#888'),
236
  hoverinfo='none',
237
  mode='lines',
238
  name='Co-occurrence Edges',
239
+ showlegend=False
240
  )
241
  fig.add_trace(edge_trace)
242
+
 
 
243
  fig.add_trace(go.Scatter(
244
  x=unique_entities['x'],
245
  y=unique_entities['y'],
 
247
  name='Entities',
248
  text=unique_entities['text'],
249
  textposition="top center",
 
 
250
  showlegend=False,
251
  marker=dict(
252
  size=unique_entities['frequency'] * 5 + 10,
 
265
  )
266
  ))
267
 
 
268
  legend_traces = []
269
  seen_labels = set()
270
  for index, row in unique_entities.iterrows():
 
273
  seen_labels.add(label)
274
  color = entity_color_map.get(label, '#cccccc')
275
  legend_traces.append(go.Scatter(
276
+ x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True
 
 
 
 
 
277
  ))
278
  for trace in legend_traces:
279
  fig.add_trace(trace)
 
282
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
283
  showlegend=True,
284
  hovermode='closest',
 
285
  xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
286
  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
287
  plot_bgcolor='#f9f9f9',
 
292
 
293
  return fig
294
 
295
+
296
+ # --- PPTX HELPER FUNCTIONS (Integrated from generate_report.py) ---
297
+
298
+ def fig_to_image_buffer(fig):
299
  """
300
+ Converts a Plotly figure object into a BytesIO buffer containing PNG data.
301
+ Requires 'kaleido' to be installed for image export.
302
+ Returns None if export fails.
303
+ """
304
+ try:
305
+ # Use pio.to_image to convert the figure to a PNG byte array
306
+ img_bytes = pio.to_image(fig, format="png", width=900, height=500, scale=2)
307
+ img_buffer = BytesIO(img_bytes)
308
+ return img_buffer
309
+ except Exception as e:
310
+ # In a Streamlit environment, we can't show this error directly in the app execution flow
311
+ print(f"Error converting Plotly figure to image: {e}")
312
+ return None
313
+
314
# --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---

def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
    """
    Generates a PowerPoint presentation (.pptx) file containing key analysis results.

    Args:
        df: DataFrame of extracted entities with at least the columns
            'text', 'label', 'category', and 'score'.
        text_input: The raw source text that was analyzed.
        elapsed_time: Processing time in seconds, shown on the title slide.
        df_topic_data: DataFrame of LDA topic results (columns Topic_ID /
            Word / Weight), or None when topic modeling was unavailable.
        reverse_category_mapping: dict mapping an entity label to its
            category name; labels not present map to 'Other'.

    Returns:
        BytesIO buffer containing the saved .pptx file, seeked to 0.
    """
    prs = Presentation()
    # NOTE: layout 5 in the default python-pptx template is "Title Only".
    # It has NO body placeholder, so every piece of content below must be
    # added as an explicit shape (text box / table / picture).
    chart_layout = prs.slide_layouts[5]

    # 1. Title Slide (layout 0 does have a subtitle placeholder at index 1)
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    title = slide.shapes.title
    subtitle = slide.placeholders[1]
    title.text = "NER & Topic Analysis Report"
    subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"

    # 2. Source Text Slide
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Analyzed Source Text"

    # Add the raw text to an explicit text box (layout has no body frame).
    left = Inches(0.5)
    top = Inches(1.5)
    width = Inches(9.0)
    height = Inches(5.0)
    txBox = slide.shapes.add_textbox(left, top, width, height)
    tf = txBox.text_frame
    tf.margin_top = Inches(0.1)
    tf.margin_bottom = Inches(0.1)
    tf.word_wrap = True
    p = tf.add_paragraph()
    p.text = text_input
    p.font.size = Pt(14)
    p.font.name = 'Arial'

    # 3. Entity Summary Slide (Table)
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Entity Summary (Count by Category and Label)"

    # Build the summary table using the app's established grouping logic.
    grouped_entity_table = df['label'].value_counts().reset_index()
    grouped_entity_table.columns = ['Entity Label', 'Count']
    grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
        lambda x: reverse_category_mapping.get(x, 'Other')
    )
    grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]

    rows, cols = grouped_entity_table.shape
    x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
    # Add 1 extra row for the header.
    table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table

    # Set column widths.
    table.columns[0].width = Inches(2.7)
    table.columns[1].width = Inches(2.8)
    table.columns[2].width = Inches(2.5)

    # Set column headers.
    for i, col in enumerate(grouped_entity_table.columns):
        cell = table.cell(0, i)
        cell.text = col
        cell.fill.solid()

    # Fill in the data rows (offset by 1 to skip the header row).
    for i in range(rows):
        for j in range(cols):
            cell = table.cell(i + 1, j)
            cell.text = str(grouped_entity_table.iloc[i, j])

    # 4. Treemap Slide (Visualization)
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_image = fig_to_image_buffer(fig_treemap)

    # Image export can fail (e.g. kaleido missing); skip the slide then.
    if treemap_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Entity Distribution Treemap"
        slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))

    # 5. Entity Count Bar Chart Slide (Visualization)
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_bar_category = px.bar(
        grouped_counts,
        x='Category',
        y='Count',
        color='Category',
        title='Total Entities per Category',
        color_discrete_sequence=px.colors.qualitative.Pastel
    )
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
    bar_category_image = fig_to_image_buffer(fig_bar_category)

    if bar_category_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Total Entities per Category"
        slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))

    # 6. Topic Modeling Bubble Chart Slide
    if df_topic_data is not None and not df_topic_data.empty:
        # Rename columns into the format expected by create_topic_word_bubbles.
        df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
        bubble_figure = create_topic_word_bubbles(df_topic_data_pptx)
        bubble_image = fig_to_image_buffer(bubble_figure)
        if bubble_image:
            slide = prs.slides.add_slide(chart_layout)
            slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
            slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
    else:
        # Placeholder slide if topic modeling is not available.
        # BUGFIX: layout 5 ("Title Only") has no placeholder at index 1, so
        # the previous slide.placeholders[1] access raised KeyError. The
        # message is now added as an explicit text box instead.
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Topic Modeling Results"
        msg_box = slide.shapes.add_textbox(Inches(0.75), Inches(1.5), Inches(8.5), Inches(1.0))
        msg_box.text_frame.word_wrap = True
        msg_box.text_frame.text = "Topic Modeling requires more unique input (at least two unique entities)."

    # Save the presentation to an in-memory buffer for st.download_button.
    pptx_buffer = BytesIO()
    prs.save(pptx_buffer)
    pptx_buffer.seek(0)
    return pptx_buffer
446
 
447
+ # --- Existing App Functionality (HTML and JSON) ---
448
+
449
+ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
450
+ """
451
+ Generates a full HTML report containing all analysis results and visualizations.
452
+ (Content omitted for brevity but assumed to be here).
453
+ """
454
  # 1. Generate Visualizations (Plotly HTML)
455
 
456
  # 1a. Treemap
 
457
  fig_treemap = px.treemap(
458
  df,
459
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
460
  values='score',
461
  color='category',
462
  title="Entity Distribution by Category and Label",
463
+ color_discrete_sequence=px.colors.qualitative.Dark24
464
  )
465
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
466
  treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
 
474
 
475
  # 1c. Bar Chart (Category Count)
476
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
 
477
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
478
  bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
479
 
480
  # 1d. Bar Chart (Most Frequent Entities)
481
  word_counts = df['text'].value_counts().reset_index()
482
  word_counts.columns = ['Entity', 'Count']
 
483
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
484
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
485
 
486
  if not repeating_entities.empty:
487
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
 
488
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
489
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
490
 
491
+ # 1e. Network Graph HTML
492
  network_fig = generate_network_graph(df, text_input)
493
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
494
 
495
+ # 1f. Topic Charts HTML
496
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
497
  if df_topic_data is not None and not df_topic_data.empty:
498
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
501
  else:
502
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
503
  else:
 
504
  topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
505
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
506
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
 
528
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
529
  h3 {{ color: #555; margin-top: 20px; }}
530
  .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
531
+ .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
533
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
534
  table th {{ background-color: #f0f0f0; }}
 
535
  .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
 
 
 
 
 
536
  </style></head><body>
537
  <div class="container">
538
  <h1>Entity and Topic Analysis Report</h1>
 
555
  <div class="chart-box">{treemap_html}</div>
556
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
557
 
 
558
  <div class="chart-box">{pie_html}</div>
559
  <div class="chart-box">{bar_category_html}</div>
560
  <div class="chart-box">{bar_freq_html}</div>
 
569
  """
570
  return html_content
571
 
572
def generate_presentation_json(df, elapsed_time, df_topic_data):
    """
    Build a JSON-serializable dict of the analysis results, structured as
    slide-by-slide data for import into a presentation tool.

    Args:
        df: DataFrame of extracted entities (needs 'text' and 'category').
        elapsed_time: Processing time in seconds.
        df_topic_data: DataFrame of topic-model output, or None.

    Returns:
        dict ready for json.dumps, or an {"error": ...} dict when df is empty.
    """
    if df.empty:
        return {"error": "No entities found for presentation export."}

    cat_counts = df['category'].value_counts()

    # Headline metrics for the introductory slide.
    metrics = {
        "Total Entities Found": len(df),
        "Unique Entities Found": len(df['text'].unique()),
        "Top_3_Entity_Categories": cat_counts.head(3).to_dict(),
    }

    # Per-category counts feeding the pie / bar charts.
    by_category = cat_counts.reset_index()
    by_category.columns = ['Category', 'Count']

    # Top 10 entities that appear more than once.
    freq = df['text'].value_counts().reset_index()
    freq.columns = ['Entity', 'Count']
    top_repeats = freq[freq['Count'] > 1].head(10)

    if df_topic_data is not None and not df_topic_data.empty:
        topic_payload = df_topic_data.to_dict('records')
    else:
        topic_payload = "Not enough unique data for topic modeling."

    return {
        "ReportTitle": "NER and Topic Analysis Presentation Data",
        "GeneratedAt": time.strftime('%Y-%m-%d %H:%M:%S'),
        "ProcessingTimeSeconds": f"{elapsed_time:.2f}",
        "Slides": [
            {
                "SlideTitle": "1. Analysis Overview and Key Metrics",
                "Metrics": metrics,
                "Note": "This data can be used for the introductory slide."
            },
            {
                "SlideTitle": "2. Entity Category Distribution (Chart Data)",
                "Data": by_category.to_dict('records'),
                "Note": "Data for Pie Chart and Category Count Bar Chart."
            },
            {
                "SlideTitle": "3. Most Frequent Entities (Top 10)",
                "Data": top_repeats.to_dict('records'),
                "Note": "Data for the Top 10 Frequent Entities Bar Chart."
            },
            {
                "SlideTitle": "4. Topic Modeling Results (Key Words)",
                "Data": topic_payload,
                "Note": "Key entities and their weights per topic from LDA."
            }
        ]
    }
630
+
631
+
632
  # --- Page Configuration and Styling (No Sidebar) ---
633
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
634
  st.markdown(
 
668
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
669
  expander = st.expander("**Important notes**")
670
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
671
+ **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
672
+ **Results:** Results are compiled into a single, comprehensive **HTML report** and a **PowerPoint (.pptx) file** for easy download and sharing.
673
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
674
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
675
 
 
679
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
680
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
681
 
 
 
 
 
 
 
 
 
 
 
682
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Loads the GLiNER model and caches it across Streamlit reruns.

    BUGFIX: the decorator was previously @st.cache_resourced, which does
    not exist in the Streamlit API and raised AttributeError at import
    time; the correct name is st.cache_resource.

    Returns:
        The loaded GLiNER model, or None if loading failed (an error is
        shown in the app via st.error).
    """
    try:
        return GLiNER.from_pretrained(
            "knowledgator/gliner-multitask-large-v0.5",
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        return None
719
  st.session_state.elapsed_time = 0.0
720
  if 'topic_results' not in st.session_state:
721
  st.session_state.topic_results = None
 
722
  if 'my_text_area' not in st.session_state:
723
  st.session_state.my_text_area = DEFAULT_TEXT
724
 
725
  # --- Clear Button Function (MODIFIED) ---
726
  def clear_text():
727
  """Clears the text area (sets it to an empty string) and hides results."""
 
728
  st.session_state['my_text_area'] = ""
729
  st.session_state.show_results = False
730
  st.session_state.last_text = ""
 
734
 
735
  # --- Text Input and Clear Button ---
736
  word_limit = 1000
 
737
  text = st.text_area(
738
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
739
  height=250,
 
791
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
792
  st.session_state.show_results = True
793
 
794
+ # --- Display Download Link and Results ---
795
  if st.session_state.show_results:
796
  df = st.session_state.results_df
797
  df_topic_data = st.session_state.topic_results
 
805
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
806
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
807
 
808
+ # 2. Entity Summary Table
809
  st.markdown("### 2. Entity Summary Table (Count by Label)")
810
  grouped_entity_table = df['label'].value_counts().reset_index()
811
  grouped_entity_table.columns = ['Entity Label', 'Count']
 
813
  st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
814
  st.markdown("---")
815
 
816
+ # 3. Detailed Entity Analysis Tabs
817
  st.markdown("### 3. Detailed Entity Analysis")
 
818
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
819
 
 
820
  with tab_category_details:
821
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
 
822
  unique_categories = list(category_mapping.keys())
 
 
823
  tabs_category = st.tabs(unique_categories)
 
824
  for category, tab in zip(unique_categories, tabs_category):
 
825
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
 
826
  with tab:
827
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
828
  if not df_category.empty:
 
829
  st.dataframe(
830
  df_category,
831
  use_container_width=True,
 
832
  column_config={'score': st.column_config.NumberColumn(format="%.4f")}
833
  )
834
  else:
835
  st.info(f"No entities of category **{category}** were found in the text.")
836
+
837
  with tab_treemap_viz:
838
  st.markdown("#### Treemap: Entity Distribution")
 
 
839
  fig_treemap = px.treemap(
840
  df,
841
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
842
  values='score',
843
  color='category',
844
  title="Entity Distribution by Category and Label",
845
+ color_discrete_sequence=px.colors.qualitative.Dark24
846
  )
847
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
848
  st.plotly_chart(fig_treemap, use_container_width=True)
849
 
850
+ # 4. Comparative Charts
851
  st.markdown("---")
852
  st.markdown("### 4. Comparative Charts")
853
 
854
+ col1, col2, col3 = st.columns(3)
 
 
855
 
856
  grouped_counts = df['category'].value_counts().reset_index()
857
  grouped_counts.columns = ['Category', 'Count']
858
 
859
+ with col1: # Pie Chart
 
860
  fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
861
  fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
862
  st.plotly_chart(fig_pie, use_container_width=True)
863
 
864
+ with col2: # Bar Chart (Category Count)
 
865
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
866
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
867
  st.plotly_chart(fig_bar_category, use_container_width=True)
868
 
869
+ with col3: # Bar Chart (Most Frequent Entities)
870
+ word_counts = df['text'].value_counts().reset_index()
871
+ word_counts.columns = ['Entity', 'Count']
872
+ repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
 
 
873
  if not repeating_entities.empty:
874
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
875
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
 
879
 
880
  st.markdown("---")
881
  st.markdown("### 5. Entity Co-occurrence Network")
 
 
882
  network_fig = generate_network_graph(df, st.session_state.last_text)
883
  st.plotly_chart(network_fig, use_container_width=True)
884
 
885
  st.markdown("---")
886
  st.markdown("### 6. Topic Modeling Analysis")
887
 
 
888
  if df_topic_data is not None and not df_topic_data.empty:
889
  bubble_figure = create_topic_word_bubbles(df_topic_data)
890
  if bubble_figure:
 
896
 
897
  # --- Report Download ---
898
  st.markdown("---")
899
+ st.markdown("### Download Full Report Artifacts")
900
 
901
+ # 1. HTML Report Download
902
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
903
  st.download_button(
904
+ label="Download Comprehensive HTML Report",
905
  data=html_report,
906
  file_name="ner_topic_report.html",
907
  mime="text/html",
908
  type="primary"
909
  )
910
 
911
+ # 2. PowerPoint PPTX Download (NEW)
912
+ pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
913
+ st.download_button(
914
+ label="Download Presentation Slides (.pptx)",
915
+ data=pptx_buffer,
916
+ file_name="ner_topic_report.pptx",
917
+ mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
918
+ type="primary"
919
+ )
920
+
921
+ # 3. Presentation JSON Data Download
922
+ presentation_data = generate_presentation_json(df, st.session_state.elapsed_time, df_topic_data)
923
+ presentation_json_data = json.dumps(presentation_data, indent=4)
924
+
925
+ st.download_button(
926
+ label="Download Presentation Data (JSON)",
927
+ data=presentation_json_data,
928
+ file_name="ner_presentation_data.json",
929
+ mime="application/json",
930
+ type="secondary"
931
+ )
932
+
933
+
934
+