AIEcosystem committed on
Commit
bebe5f4
·
verified ·
1 Parent(s): d81c772

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +134 -45
src/streamlit_app.py CHANGED
@@ -16,7 +16,7 @@ from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
- import plotly.io as pio # Required for image export
20
  # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -184,14 +184,14 @@ def create_topic_word_bubbles(df_topic_data):
184
  def generate_network_graph(df, raw_text):
185
  """
186
  Generates a network graph visualization (Node Plot) with edges
187
- based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
188
  """
189
- # Using the existing generate_network_graph logic from previous context...
190
  entity_counts = df['text'].value_counts().reset_index()
191
  entity_counts.columns = ['text', 'frequency']
192
 
193
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
194
  if unique_entities.shape[0] < 2:
 
195
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
196
 
197
  num_nodes = len(unique_entities)
@@ -307,8 +307,9 @@ def fig_to_image_buffer(fig):
307
  img_buffer = BytesIO(img_bytes)
308
  return img_buffer
309
  except Exception as e:
310
- # In a Streamlit environment, we can't show this error directly in the app execution flow
311
- print(f"Error converting Plotly figure to image: {e}")
 
312
  return None
313
 
314
  # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
@@ -322,7 +323,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
322
  # Layout 5: Title and Content (often good for charts)
323
  chart_layout = prs.slide_layouts[5]
324
 
325
- # 1. Title Slide
326
  title_slide_layout = prs.slide_layouts[0]
327
  slide = prs.slides.add_slide(title_slide_layout)
328
  title = slide.shapes.title
@@ -330,9 +331,9 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
330
  title.text = "NER & Topic Analysis Report"
331
  subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
332
 
333
- # 2. Source Text Slide
334
  slide = prs.slides.add_slide(chart_layout)
335
- slide.shapes.title.text = "Analyzed Source Text"
336
 
337
  # Add the raw text to a text box
338
  left = Inches(0.5)
@@ -349,44 +350,83 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
349
  p.font.size = Pt(14)
350
  p.font.name = 'Arial'
351
 
352
- # 3. Entity Summary Slide (Table)
353
  slide = prs.slides.add_slide(chart_layout)
354
- slide.shapes.title.text = "Entity Summary (Count by Category and Label)"
355
 
356
- # Create the summary table using the app's established logic
357
- grouped_entity_table = df['label'].value_counts().reset_index()
358
- grouped_entity_table.columns = ['Entity Label', 'Count']
359
- grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
360
- lambda x: reverse_category_mapping.get(x, 'Other')
361
- )
362
- grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
 
 
 
364
  # Simple way to insert a table:
365
- rows, cols = grouped_entity_table.shape
366
- x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
 
 
 
 
 
 
367
  # Add 1 row for the header
368
- table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table
369
 
370
  # Set column widths
371
- table.columns[0].width = Inches(2.7)
372
- table.columns[1].width = Inches(2.8)
373
- table.columns[2].width = Inches(2.5)
 
374
 
375
  # Set column headers
376
- for i, col in enumerate(grouped_entity_table.columns):
 
377
  cell = table.cell(0, i)
378
  cell.text = col
379
- cell.fill.solid()
380
  # Optional: Add simple styling to header
381
 
382
  # Fill in the data
383
- for i in range(rows):
384
  for j in range(cols):
385
  cell = table.cell(i+1, j)
386
- cell.text = str(grouped_entity_table.iloc[i, j])
 
 
 
387
  # Optional: Style data cells
 
 
 
 
 
 
 
 
388
 
389
- # 4. Treemap Slide (Visualization)
390
  fig_treemap = px.treemap(
391
  df,
392
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
@@ -398,14 +438,31 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
398
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
399
  treemap_image = fig_to_image_buffer(fig_treemap)
400
 
 
 
401
  if treemap_image:
402
- slide = prs.slides.add_slide(chart_layout)
403
- slide.shapes.title.text = "Entity Distribution Treemap"
404
  slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
 
 
 
405
 
406
- # 5. Entity Count Bar Chart Slide (Visualization)
407
  grouped_counts = df['category'].value_counts().reset_index()
408
  grouped_counts.columns = ['Category', 'Count']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  fig_bar_category = px.bar(
410
  grouped_counts,
411
  x='Category',
@@ -417,12 +474,47 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
417
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
418
  bar_category_image = fig_to_image_buffer(fig_bar_category)
419
 
 
 
420
  if bar_category_image:
421
- slide = prs.slides.add_slide(chart_layout)
422
- slide.shapes.title.text = "Total Entities per Category"
423
  slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
- # 6. Topic Modeling Bubble Chart Slide
 
 
 
 
 
 
 
 
 
 
 
 
426
  if df_topic_data is not None and not df_topic_data.empty:
427
  # Ensure data frame is in the format expected by create_topic_word_bubbles
428
  df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
@@ -432,8 +524,11 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
432
  slide = prs.slides.add_slide(chart_layout)
433
  slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
434
  slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
 
 
 
 
435
  else:
436
- # Placeholder slide if topic modeling is not available
437
  slide = prs.slides.add_slide(chart_layout)
438
  slide.shapes.title.text = "Topic Modeling Results"
439
  slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
@@ -444,7 +539,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
444
  pptx_buffer.seek(0)
445
  return pptx_buffer
446
 
447
- # --- NEW CSV GENERATION FUNCTION ---
448
  def generate_entity_csv(df):
449
  """
450
  Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -458,7 +553,7 @@ def generate_entity_csv(df):
458
  return csv_buffer
459
  # -----------------------------------
460
 
461
- # --- Existing App Functionality (HTML) ---
462
 
463
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
464
  """
@@ -623,7 +718,7 @@ st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
623
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
624
  expander = st.expander("**Important notes**")
625
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
626
- **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
627
  **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
628
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
629
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
@@ -635,7 +730,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
635
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
636
 
637
  # --- Model Loading ---
638
- @st.cache_resource
639
  def load_ner_model():
640
  """Loads the GLiNER model and caches it."""
641
  try:
@@ -882,9 +977,3 @@ if st.session_state.show_results:
882
  mime="text/csv",
883
  type="secondary"
884
  )
885
-
886
-
887
-
888
-
889
-
890
-
 
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
+ import plotly.io as pio # Required for image export (needs kaleido!)
20
  # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
 
184
  def generate_network_graph(df, raw_text):
185
  """
186
  Generates a network graph visualization (Node Plot) with edges
187
+ based on entity co-occurrence in sentences.
188
  """
 
189
  entity_counts = df['text'].value_counts().reset_index()
190
  entity_counts.columns = ['text', 'frequency']
191
 
192
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
193
  if unique_entities.shape[0] < 2:
194
+ # Return a blank figure if not enough entities
195
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
196
 
197
  num_nodes = len(unique_entities)
 
307
  img_buffer = BytesIO(img_bytes)
308
  return img_buffer
309
  except Exception as e:
310
+ # Print the error for debugging purposes in the Streamlit console
311
+ # This message is CRITICAL for the user to understand why plots are missing
312
+ print(f"ERROR: Failed to convert Plotly figure to image for PPTX. This usually means 'kaleido' is missing. Error: {e}")
313
  return None
314
 
315
  # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
 
323
  # Layout 5: Title and Content (often good for charts)
324
  chart_layout = prs.slide_layouts[5]
325
 
326
+ # --- 1. Title Slide ---
327
  title_slide_layout = prs.slide_layouts[0]
328
  slide = prs.slides.add_slide(title_slide_layout)
329
  title = slide.shapes.title
 
331
  title.text = "NER & Topic Analysis Report"
332
  subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
333
 
334
+ # --- 2. Source Text Slide ---
335
  slide = prs.slides.add_slide(chart_layout)
336
+ slide.shapes.title.text = "Analyzed Source Text (Raw)"
337
 
338
  # Add the raw text to a text box
339
  left = Inches(0.5)
 
350
  p.font.size = Pt(14)
351
  p.font.name = 'Arial'
352
 
353
+ # --- 3. Highlighted Text Slide ---
354
  slide = prs.slides.add_slide(chart_layout)
355
+ slide.shapes.title.text = "Analyzed Source Text with Entity Highlights"
356
 
357
+ # Generate the HTML for highlighting (we need to strip the HTML formatting for PPTX text box)
358
+ highlighted_html = highlight_entities(text_input, df)
359
+ # Simple regex to remove the HTML tags, keeping only the text content
360
+ highlighted_clean_text = re.sub(r'<[^>]*>', '', highlighted_html)
361
+ highlighted_clean_text = highlighted_clean_text.replace("div style", "").strip()
362
+
363
+ # Add the text to a text box
364
+ left = Inches(0.5)
365
+ top = Inches(1.5)
366
+ width = Inches(9.0)
367
+ height = Inches(5.0)
368
+ txBox = slide.shapes.add_textbox(left, top, width, height)
369
+ tf = txBox.text_frame
370
+ tf.margin_top = Inches(0.1)
371
+ tf.margin_bottom = Inches(0.1)
372
+ tf.word_wrap = True
373
+ p = tf.add_paragraph()
374
+ p.text = highlighted_clean_text
375
+ p.font.size = Pt(12)
376
+ p.font.name = 'Arial'
377
+ p.font.color.rgb = prs.theme.theme_color_scheme.get_color(0) # Default text color
378
+
379
+ # --- 4. Extracted Entities Table Slide ---
380
+ slide = prs.slides.add_slide(chart_layout)
381
+ slide.shapes.title.text = "Extracted Entities Table"
382
 
383
+ # Prepare the dataframe for the table
384
+ table_df = df[['category', 'label', 'text', 'score']].sort_values(by=['category', 'label', 'score'], ascending=[True, True, False])
385
+
386
  # Simple way to insert a table:
387
+ rows, cols = table_df.shape
388
+ # Cap the table size for the slide, otherwise it gets too cramped
389
+ max_rows = 15
390
+ table_to_display = table_df.head(max_rows)
391
+ rows_display = len(table_to_display)
392
+
393
+ x, y, cx, cy = Inches(0.2), Inches(1.2), Inches(9.6), Inches(6.0)
394
+
395
  # Add 1 row for the header
396
+ table = slide.shapes.add_table(rows_display + 1, cols, x, y, cx, cy).table
397
 
398
  # Set column widths
399
+ table.columns[0].width = Inches(2.0) # Category
400
+ table.columns[1].width = Inches(2.0) # Label
401
+ table.columns[2].width = Inches(4.0) # Text
402
+ table.columns[3].width = Inches(1.6) # Score
403
 
404
  # Set column headers
405
+ header_cols = ['Category', 'Label', 'Text', 'Score']
406
+ for i, col in enumerate(header_cols):
407
  cell = table.cell(0, i)
408
  cell.text = col
 
409
  # Optional: Add simple styling to header
410
 
411
  # Fill in the data
412
+ for i in range(rows_display):
413
  for j in range(cols):
414
  cell = table.cell(i+1, j)
415
+ if table_df.columns[j] == 'score':
416
+ cell.text = f"{table_to_display.iloc[i, j]:.4f}"
417
+ else:
418
+ cell.text = str(table_to_display.iloc[i, j])
419
  # Optional: Style data cells
420
+
421
+ if rows > max_rows:
422
+ slide.placeholders[1].text = f"... Table truncated for slide readability. Full data contains {rows} entries. See CSV file for all data."
423
+ slide.placeholders[1].top = Inches(6.5)
424
+ slide.placeholders[1].left = Inches(0.5)
425
+ slide.placeholders[1].width = Inches(9.0)
426
+ slide.placeholders[1].height = Inches(0.5)
427
+
428
 
429
+ # --- 5. Treemap Slide (Visualization) ---
430
  fig_treemap = px.treemap(
431
  df,
432
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
 
438
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
439
  treemap_image = fig_to_image_buffer(fig_treemap)
440
 
441
+ slide = prs.slides.add_slide(chart_layout)
442
+ slide.shapes.title.text = "Entity Distribution Treemap"
443
  if treemap_image:
 
 
444
  slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
445
+ else:
446
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
447
+
448
 
449
+ # --- 6. Pie Chart Slide (Visualization) ---
450
  grouped_counts = df['category'].value_counts().reset_index()
451
  grouped_counts.columns = ['Category', 'Count']
452
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
453
+ fig_pie.update_layout(margin=dict(t=50, b=10))
454
+ pie_image = fig_to_image_buffer(fig_pie)
455
+
456
+ slide = prs.slides.add_slide(chart_layout)
457
+ slide.shapes.title.text = "Entity Distribution Pie Chart"
458
+ if pie_image:
459
+ # Pie charts often look better centered on the slide
460
+ slide.shapes.add_picture(pie_image, Inches(1.5), Inches(1.5), width=Inches(7.0))
461
+ else:
462
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
463
+
464
+
465
+ # --- 7. Category Count Bar Chart Slide (Visualization) ---
466
  fig_bar_category = px.bar(
467
  grouped_counts,
468
  x='Category',
 
474
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
475
  bar_category_image = fig_to_image_buffer(fig_bar_category)
476
 
477
+ slide = prs.slides.add_slide(chart_layout)
478
+ slide.shapes.title.text = "Total Entities per Category Bar Chart"
479
  if bar_category_image:
 
 
480
  slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
481
+ else:
482
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
483
+
484
+ # --- 8. Most Frequent Entities Bar Chart Slide (Visualization) ---
485
+ word_counts = df['text'].value_counts().reset_index()
486
+ word_counts.columns = ['Entity', 'Count']
487
+ repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
488
+
489
+ if not repeating_entities.empty:
490
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
491
+ fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
492
+ bar_freq_image = fig_to_image_buffer(fig_bar_freq)
493
+
494
+ slide = prs.slides.add_slide(chart_layout)
495
+ slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
496
+ if bar_freq_image:
497
+ slide.shapes.add_picture(bar_freq_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
498
+ else:
499
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
500
+ else:
501
+ slide = prs.slides.add_slide(chart_layout)
502
+ slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
503
+ slide.placeholders[1].text = "No entities repeat in the text, so a frequency chart was not generated."
504
 
505
+
506
+ # --- 9. Network Graph Slide (Visualization) ---
507
+ network_fig = generate_network_graph(df, text_input)
508
+ network_image = fig_to_image_buffer(network_fig)
509
+
510
+ slide = prs.slides.add_slide(chart_layout)
511
+ slide.shapes.title.text = "Entity Co-occurrence Network"
512
+ if network_image:
513
+ slide.shapes.add_picture(network_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
514
+ else:
515
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
516
+
517
+ # --- 10. Topic Modeling Bubble Chart Slide ---
518
  if df_topic_data is not None and not df_topic_data.empty:
519
  # Ensure data frame is in the format expected by create_topic_word_bubbles
520
  df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
 
524
  slide = prs.slides.add_slide(chart_layout)
525
  slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
526
  slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
527
+ else:
528
+ slide = prs.slides.add_slide(chart_layout)
529
+ slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
530
+ slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
531
  else:
 
532
  slide = prs.slides.add_slide(chart_layout)
533
  slide.shapes.title.text = "Topic Modeling Results"
534
  slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
 
539
  pptx_buffer.seek(0)
540
  return pptx_buffer
541
 
542
+ # --- NEW CSV GENERATION FUNCTION (Retained) ---
543
  def generate_entity_csv(df):
544
  """
545
  Generates a CSV file of the extracted entities in an in-memory buffer,
 
553
  return csv_buffer
554
  # -----------------------------------
555
 
556
+ # --- Existing App Functionality (HTML) (Retained) ---
557
 
558
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
559
  """
 
718
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
719
  expander = st.expander("**Important notes**")
720
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
721
+ **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and crucially, **`kaleido`** (for converting Plotly charts into static images).
722
  **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
723
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
724
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
730
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
731
 
732
  # --- Model Loading ---
733
+ @st.cache_resourced
734
  def load_ner_model():
735
  """Loads the GLiNER model and caches it."""
736
  try:
 
977
  mime="text/csv",
978
  type="secondary"
979
  )