AIEcosystem committed on
Commit
4dbacfd
·
verified ·
1 Parent(s): 46fc5df

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +112 -255
src/streamlit_app.py CHANGED
@@ -16,7 +16,7 @@ from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
- import plotly.io as pio # Required for image export (needs kaleido!)
20
  # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -66,7 +66,8 @@ category_mapping = {
66
  "Temporal & Events": ["event", "date"],
67
  "Digital & Products": ["platform", "product", "media_type", "url"],
68
  }
69
- reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
70
 
71
 
72
  # --- Utility Functions for Analysis and Plotly ---
@@ -178,7 +179,8 @@ def create_topic_word_bubbles(df_topic_data):
178
  height=600,
179
  margin=dict(t=50, b=100, l=50, r=10),
180
  )
181
- fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
 
182
  return fig
183
 
184
  def generate_network_graph(df, raw_text):
@@ -191,7 +193,6 @@ def generate_network_graph(df, raw_text):
191
 
192
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
193
  if unique_entities.shape[0] < 2:
194
- # Return a blank figure if not enough entities
195
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
196
 
197
  num_nodes = len(unique_entities)
@@ -293,7 +294,7 @@ def generate_network_graph(df, raw_text):
293
  return fig
294
 
295
 
296
- # --- PPTX HELPER FUNCTIONS (Integrated from generate_report.py) ---
297
 
298
  def fig_to_image_buffer(fig):
299
  """
@@ -307,12 +308,11 @@ def fig_to_image_buffer(fig):
307
  img_buffer = BytesIO(img_bytes)
308
  return img_buffer
309
  except Exception as e:
310
- # Print the error for debugging purposes in the Streamlit console
311
- # This message is CRITICAL for the user to understand why plots are missing
312
- print(f"ERROR: Failed to convert Plotly figure to image for PPTX. This usually means 'kaleido' is missing. Error: {e}")
313
  return None
314
 
315
- # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
316
 
317
  def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
318
  """
@@ -323,7 +323,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
323
  # Layout 5: Title and Content (often good for charts)
324
  chart_layout = prs.slide_layouts[5]
325
 
326
- # --- 1. Title Slide ---
327
  title_slide_layout = prs.slide_layouts[0]
328
  slide = prs.slides.add_slide(title_slide_layout)
329
  title = slide.shapes.title
@@ -331,9 +331,9 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
331
  title.text = "NER & Topic Analysis Report"
332
  subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
333
 
334
- # --- 2. Source Text Slide ---
335
  slide = prs.slides.add_slide(chart_layout)
336
- slide.shapes.title.text = "Analyzed Source Text (Raw)"
337
 
338
  # Add the raw text to a text box
339
  left = Inches(0.5)
@@ -350,83 +350,44 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
350
  p.font.size = Pt(14)
351
  p.font.name = 'Arial'
352
 
353
- # --- 3. Highlighted Text Slide ---
354
  slide = prs.slides.add_slide(chart_layout)
355
- slide.shapes.title.text = "Analyzed Source Text with Entity Highlights"
356
 
357
- # Generate the HTML for highlighting (we need to strip the HTML formatting for PPTX text box)
358
- highlighted_html = highlight_entities(text_input, df)
359
- # Simple regex to remove the HTML tags, keeping only the text content
360
- highlighted_clean_text = re.sub(r'<[^>]*>', '', highlighted_html)
361
- highlighted_clean_text = highlighted_clean_text.replace("div style", "").strip()
362
-
363
- # Add the text to a text box
364
- left = Inches(0.5)
365
- top = Inches(1.5)
366
- width = Inches(9.0)
367
- height = Inches(5.0)
368
- txBox = slide.shapes.add_textbox(left, top, width, height)
369
- tf = txBox.text_frame
370
- tf.margin_top = Inches(0.1)
371
- tf.margin_bottom = Inches(0.1)
372
- tf.word_wrap = True
373
- p = tf.add_paragraph()
374
- p.text = highlighted_clean_text
375
- p.font.size = Pt(12)
376
- p.font.name = 'Arial'
377
- p.font.color.rgb = prs.theme.theme_color_scheme.get_color(0) # Default text color
378
-
379
- # --- 4. Extracted Entities Table Slide ---
380
- slide = prs.slides.add_slide(chart_layout)
381
- slide.shapes.title.text = "Extracted Entities Table"
382
 
383
- # Prepare the dataframe for the table
384
- table_df = df[['category', 'label', 'text', 'score']].sort_values(by=['category', 'label', 'score'], ascending=[True, True, False])
385
-
386
  # Simple way to insert a table:
387
- rows, cols = table_df.shape
388
- # Cap the table size for the slide, otherwise it gets too cramped
389
- max_rows = 15
390
- table_to_display = table_df.head(max_rows)
391
- rows_display = len(table_to_display)
392
-
393
- x, y, cx, cy = Inches(0.2), Inches(1.2), Inches(9.6), Inches(6.0)
394
-
395
  # Add 1 row for the header
396
- table = slide.shapes.add_table(rows_display + 1, cols, x, y, cx, cy).table
397
 
398
  # Set column widths
399
- table.columns[0].width = Inches(2.0) # Category
400
- table.columns[1].width = Inches(2.0) # Label
401
- table.columns[2].width = Inches(4.0) # Text
402
- table.columns[3].width = Inches(1.6) # Score
403
 
404
  # Set column headers
405
- header_cols = ['Category', 'Label', 'Text', 'Score']
406
- for i, col in enumerate(header_cols):
407
  cell = table.cell(0, i)
408
  cell.text = col
 
409
  # Optional: Add simple styling to header
410
 
411
  # Fill in the data
412
- for i in range(rows_display):
413
  for j in range(cols):
414
  cell = table.cell(i+1, j)
415
- if table_df.columns[j] == 'score':
416
- cell.text = f"{table_to_display.iloc[i, j]:.4f}"
417
- else:
418
- cell.text = str(table_to_display.iloc[i, j])
419
  # Optional: Style data cells
420
-
421
- if rows > max_rows:
422
- slide.placeholders[1].text = f"... Table truncated for slide readability. Full data contains {rows} entries. See CSV file for all data."
423
- slide.placeholders[1].top = Inches(6.5)
424
- slide.placeholders[1].left = Inches(0.5)
425
- slide.placeholders[1].width = Inches(9.0)
426
- slide.placeholders[1].height = Inches(0.5)
427
-
428
 
429
- # --- 5. Treemap Slide (Visualization) ---
430
  fig_treemap = px.treemap(
431
  df,
432
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
@@ -438,31 +399,20 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
438
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
439
  treemap_image = fig_to_image_buffer(fig_treemap)
440
 
441
- slide = prs.slides.add_slide(chart_layout)
442
- slide.shapes.title.text = "Entity Distribution Treemap"
443
  if treemap_image:
 
 
444
  slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
445
  else:
446
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
 
 
 
447
 
448
 
449
- # --- 6. Pie Chart Slide (Visualization) ---
450
  grouped_counts = df['category'].value_counts().reset_index()
451
  grouped_counts.columns = ['Category', 'Count']
452
- fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
453
- fig_pie.update_layout(margin=dict(t=50, b=10))
454
- pie_image = fig_to_image_buffer(fig_pie)
455
-
456
- slide = prs.slides.add_slide(chart_layout)
457
- slide.shapes.title.text = "Entity Distribution Pie Chart"
458
- if pie_image:
459
- # Pie charts often look better centered on the slide
460
- slide.shapes.add_picture(pie_image, Inches(1.5), Inches(1.5), width=Inches(7.0))
461
- else:
462
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
463
-
464
-
465
- # --- 7. Category Count Bar Chart Slide (Visualization) ---
466
  fig_bar_category = px.bar(
467
  grouped_counts,
468
  x='Category',
@@ -474,47 +424,17 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
474
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
475
  bar_category_image = fig_to_image_buffer(fig_bar_category)
476
 
477
- slide = prs.slides.add_slide(chart_layout)
478
- slide.shapes.title.text = "Total Entities per Category Bar Chart"
479
  if bar_category_image:
480
- slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
481
- else:
482
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
483
-
484
- # --- 8. Most Frequent Entities Bar Chart Slide (Visualization) ---
485
- word_counts = df['text'].value_counts().reset_index()
486
- word_counts.columns = ['Entity', 'Count']
487
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
488
-
489
- if not repeating_entities.empty:
490
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
491
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
492
- bar_freq_image = fig_to_image_buffer(fig_bar_freq)
493
-
494
  slide = prs.slides.add_slide(chart_layout)
495
- slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
496
- if bar_freq_image:
497
- slide.shapes.add_picture(bar_freq_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
498
- else:
499
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
500
  else:
501
  slide = prs.slides.add_slide(chart_layout)
502
- slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
503
- slide.placeholders[1].text = "No entities repeat in the text, so a frequency chart was not generated."
504
-
505
 
506
- # --- 9. Network Graph Slide (Visualization) ---
507
- network_fig = generate_network_graph(df, text_input)
508
- network_image = fig_to_image_buffer(network_fig)
509
-
510
- slide = prs.slides.add_slide(chart_layout)
511
- slide.shapes.title.text = "Entity Co-occurrence Network"
512
- if network_image:
513
- slide.shapes.add_picture(network_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
514
- else:
515
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
516
 
517
- # --- 10. Topic Modeling Bubble Chart Slide ---
518
  if df_topic_data is not None and not df_topic_data.empty:
519
  # Ensure data frame is in the format expected by create_topic_word_bubbles
520
  df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
@@ -526,9 +446,11 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
526
  slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
527
  else:
528
  slide = prs.slides.add_slide(chart_layout)
529
- slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
530
- slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
 
531
  else:
 
532
  slide = prs.slides.add_slide(chart_layout)
533
  slide.shapes.title.text = "Topic Modeling Results"
534
  slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
@@ -539,7 +461,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
539
  pptx_buffer.seek(0)
540
  return pptx_buffer
541
 
542
- # --- NEW CSV GENERATION FUNCTION (Retained) ---
543
  def generate_entity_csv(df):
544
  """
545
  Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -553,12 +475,10 @@ def generate_entity_csv(df):
553
  return csv_buffer
554
  # -----------------------------------
555
 
556
- # --- Existing App Functionality (HTML) (Retained) ---
557
-
558
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
559
  """
560
  Generates a full HTML report containing all analysis results and visualizations.
561
- (Content omitted for brevity but assumed to be here).
562
  """
563
  # 1. Generate Visualizations (Plotly HTML)
564
 
@@ -645,7 +565,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
645
  </style></head><body>
646
  <div class="container">
647
  <h1>Entity and Topic Analysis Report</h1>
648
-
649
  <div class="metadata">
650
  <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
651
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
@@ -655,25 +574,19 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
655
  <div class="highlighted-text-container">
656
  {highlighted_text_html}
657
  </div>
658
-
659
  <h2>2. Full Extracted Entities Table</h2>
660
  {entity_table_html}
661
  <h2>3. Data Visualizations</h2>
662
-
663
  <h3>3.1 Entity Distribution Treemap</h3>
664
  <div class="chart-box">{treemap_html}</div>
665
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
666
-
667
  <div class="chart-box">{pie_html}</div>
668
  <div class="chart-box">{bar_category_html}</div>
669
  <div class="chart-box">{bar_freq_html}</div>
670
-
671
  <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
672
  <div class="chart-box">{network_html}</div>
673
-
674
  <h2>4. Topic Modeling (LDA on Entities)</h2>
675
  {topic_charts_html}
676
-
677
  </div></body></html>
678
  """
679
  return html_content
@@ -705,6 +618,10 @@ st.markdown(
705
  border: none;
706
  padding: 10px 20px;
707
  border-radius: 5px;
 
 
 
 
708
  }
709
  /* Expander header and content background */
710
  .streamlit-expanderHeader, .streamlit-expanderContent {
@@ -715,10 +632,10 @@ st.markdown(
715
  """,
716
  unsafe_allow_html=True)
717
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
718
- st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
719
  expander = st.expander("**Important notes**")
720
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
721
- **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and crucially, **`kaleido`** (for converting Plotly charts into static images).
722
  **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
723
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
724
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
@@ -841,139 +758,79 @@ if st.button("Results"):
841
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
842
  st.session_state.show_results = True
843
 
844
- # --- Display Download Link and Results ---
845
  if st.session_state.show_results:
846
  df = st.session_state.results_df
847
- df_topic_data = st.session_state.topic_results
848
 
849
  if df.empty:
850
- st.warning("No entities were found in the provided text.")
851
  else:
852
- st.subheader("Analysis Results", divider="blue")
853
-
854
- # 1. Highlighted Text
855
- st.markdown("### 1. Analyzed Text with Highlighted Entities")
856
- st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
857
-
858
- # 2. Entity Summary Table
859
- st.markdown("### 2. Entity Summary Table (Count by Label)")
860
- grouped_entity_table = df['label'].value_counts().reset_index()
861
- grouped_entity_table.columns = ['Entity Label', 'Count']
862
- grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
863
- st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
864
- st.markdown("---")
865
 
866
- # 3. Detailed Entity Analysis Tabs
867
- st.markdown("### 3. Detailed Entity Analysis")
868
- tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
869
-
870
- with tab_category_details:
871
- st.markdown("#### Detailed Entities Table (Grouped by Category)")
872
- unique_categories = list(category_mapping.keys())
873
- tabs_category = st.tabs(unique_categories)
874
- for category, tab in zip(unique_categories, tabs_category):
875
- df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
876
- with tab:
877
- st.markdown(f"##### {category} Entities ({len(df_category)} total)")
878
- if not df_category.empty:
879
- st.dataframe(
880
- df_category,
881
- use_container_width=True,
882
- column_config={'score': st.column_config.NumberColumn(format="%.4f")}
883
- )
884
- else:
885
- st.info(f"No entities of category **{category}** were found in the text.")
886
-
887
- with tab_treemap_viz:
888
- st.markdown("#### Treemap: Entity Distribution")
889
- fig_treemap = px.treemap(
890
  df,
891
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
892
- values='score',
893
- color='category',
894
- title="Entity Distribution by Category and Label",
895
- color_discrete_sequence=px.colors.qualitative.Dark24
896
  )
897
- fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
898
- st.plotly_chart(fig_treemap, use_container_width=True)
899
 
900
- # 4. Comparative Charts
 
 
 
 
901
  st.markdown("---")
902
- st.markdown("### 4. Comparative Charts")
903
 
904
  col1, col2, col3 = st.columns(3)
905
 
906
- grouped_counts = df['category'].value_counts().reset_index()
907
- grouped_counts.columns = ['Category', 'Count']
908
-
909
- with col1: # Pie Chart
910
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
911
- fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
912
- st.plotly_chart(fig_pie, use_container_width=True)
913
-
914
- with col2: # Bar Chart (Category Count)
915
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
916
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
917
- st.plotly_chart(fig_bar_category, use_container_width=True)
918
-
919
- with col3: # Bar Chart (Most Frequent Entities)
920
- word_counts = df['text'].value_counts().reset_index()
921
- word_counts.columns = ['Entity', 'Count']
922
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
923
- if not repeating_entities.empty:
924
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
925
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
926
- st.plotly_chart(fig_bar_freq, use_container_width=True)
927
- else:
928
- st.info("No entities repeat for frequency chart.")
 
929
 
930
  st.markdown("---")
931
- st.markdown("### 5. Entity Co-occurrence Network")
932
- network_fig = generate_network_graph(df, st.session_state.last_text)
933
- st.plotly_chart(network_fig, use_container_width=True)
934
 
935
- st.markdown("---")
936
- st.markdown("### 6. Topic Modeling Analysis")
937
-
938
- if df_topic_data is not None and not df_topic_data.empty:
939
- bubble_figure = create_topic_word_bubbles(df_topic_data)
940
- if bubble_figure:
941
- st.plotly_chart(bubble_figure, use_container_width=True)
942
- else:
943
- st.error("Error generating Topic Word Bubble Chart.")
944
- else:
945
- st.info("Topic modeling requires more unique input (at least two unique entities).")
946
 
947
- # --- Report Download ---
948
- st.markdown("---")
949
- st.markdown("### Download Full Report Artifacts")
950
-
951
- # 1. HTML Report Download (Retained)
952
- html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
953
- st.download_button(
954
- label="Download Comprehensive HTML Report",
955
- data=html_report,
956
- file_name="ner_topic_report.html",
957
- mime="text/html",
958
- type="primary"
959
  )
960
 
961
- # 2. PowerPoint PPTX Download (Retained)
962
- pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
963
- st.download_button(
964
- label="Download Presentation Slides (.pptx)",
965
- data=pptx_buffer,
966
- file_name="ner_topic_report.pptx",
967
- mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
968
- type="primary"
969
- )
970
 
971
- # 3. CSV Data Download (NEW)
972
- csv_buffer = generate_entity_csv(df)
973
- st.download_button(
974
- label="Download Extracted Entities (CSV)",
975
- data=csv_buffer,
976
- file_name="extracted_entities.csv",
977
- mime="text/csv",
978
- type="secondary"
979
- )
 
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
18
  from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
+ import plotly.io as pio # Required for image export (needs kaleido installed)
20
  # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
 
66
  "Temporal & Events": ["event", "date"],
67
  "Digital & Products": ["platform", "product", "media_type", "url"],
68
  }
69
+ reverse_category_mapping = {label: category
70
+ for category, label_list in category_mapping.items() for label in label_list}
71
 
72
 
73
  # --- Utility Functions for Analysis and Plotly ---
 
179
  height=600,
180
  margin=dict(t=50, b=100, l=50, r=10),
181
  )
182
+ fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
183
+ marker=dict(line=dict(width=1, color='DarkSlateGrey')))
184
  return fig
185
 
186
  def generate_network_graph(df, raw_text):
 
193
 
194
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
195
  if unique_entities.shape[0] < 2:
 
196
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
197
 
198
  num_nodes = len(unique_entities)
 
294
  return fig
295
 
296
 
297
+ # --- PPTX HELPER FUNCTIONS ---
298
 
299
  def fig_to_image_buffer(fig):
300
  """
 
308
  img_buffer = BytesIO(img_bytes)
309
  return img_buffer
310
  except Exception as e:
311
+ # Print error to console/logs, as Streamlit elements cannot be used here
312
+ print(f"Error converting Plotly figure to image (Check Kaleido installation/permissions): {e}")
 
313
  return None
314
 
315
+ # --- PPTX GENERATION FUNCTION ---
316
 
317
  def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
318
  """
 
323
  # Layout 5: Title and Content (often good for charts)
324
  chart_layout = prs.slide_layouts[5]
325
 
326
+ # 1. Title Slide
327
  title_slide_layout = prs.slide_layouts[0]
328
  slide = prs.slides.add_slide(title_slide_layout)
329
  title = slide.shapes.title
 
331
  title.text = "NER & Topic Analysis Report"
332
  subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
333
 
334
+ # 2. Source Text Slide
335
  slide = prs.slides.add_slide(chart_layout)
336
+ slide.shapes.title.text = "Analyzed Source Text"
337
 
338
  # Add the raw text to a text box
339
  left = Inches(0.5)
 
350
  p.font.size = Pt(14)
351
  p.font.name = 'Arial'
352
 
353
+ # 3. Entity Summary Slide (Table)
354
  slide = prs.slides.add_slide(chart_layout)
355
+ slide.shapes.title.text = "Entity Summary (Count by Category and Label)"
356
 
357
+ # Create the summary table using the app's established logic
358
+ grouped_entity_table = df['label'].value_counts().reset_index()
359
+ grouped_entity_table.columns = ['Entity Label', 'Count']
360
+ grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
361
+ lambda x: reverse_category_mapping.get(x, 'Other')
362
+ )
363
+ grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
 
 
 
365
  # Simple way to insert a table:
366
+ rows, cols = grouped_entity_table.shape
367
+ x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
 
 
 
 
 
 
368
  # Add 1 row for the header
369
+ table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table
370
 
371
  # Set column widths
372
+ table.columns[0].width = Inches(2.7)
373
+ table.columns[1].width = Inches(2.8)
374
+ table.columns[2].width = Inches(2.5)
 
375
 
376
  # Set column headers
377
+ for i, col in enumerate(grouped_entity_table.columns):
 
378
  cell = table.cell(0, i)
379
  cell.text = col
380
+ cell.fill.solid()
381
  # Optional: Add simple styling to header
382
 
383
  # Fill in the data
384
+ for i in range(rows):
385
  for j in range(cols):
386
  cell = table.cell(i+1, j)
387
+ cell.text = str(grouped_entity_table.iloc[i, j])
 
 
 
388
  # Optional: Style data cells
 
 
 
 
 
 
 
 
389
 
390
+ # 4. Treemap Slide (Visualization)
391
  fig_treemap = px.treemap(
392
  df,
393
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
 
399
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
400
  treemap_image = fig_to_image_buffer(fig_treemap)
401
 
 
 
402
  if treemap_image:
403
+ slide = prs.slides.add_slide(chart_layout)
404
+ slide.shapes.title.text = "Entity Distribution Treemap"
405
  slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
406
  else:
407
+ # Placeholder if image conversion failed (e.g., Kaleido issue)
408
+ slide = prs.slides.add_slide(chart_layout)
409
+ slide.shapes.title.text = "Entity Distribution Treemap (Chart Failed)"
410
+ slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
411
 
412
 
413
+ # 5. Entity Count Bar Chart Slide (Visualization)
414
  grouped_counts = df['category'].value_counts().reset_index()
415
  grouped_counts.columns = ['Category', 'Count']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  fig_bar_category = px.bar(
417
  grouped_counts,
418
  x='Category',
 
424
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
425
  bar_category_image = fig_to_image_buffer(fig_bar_category)
426
 
 
 
427
  if bar_category_image:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  slide = prs.slides.add_slide(chart_layout)
429
+ slide.shapes.title.text = "Total Entities per Category"
430
+ slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
 
 
 
431
  else:
432
  slide = prs.slides.add_slide(chart_layout)
433
+ slide.shapes.title.text = "Total Entities per Category (Chart Failed)"
434
+ slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
 
435
 
 
 
 
 
 
 
 
 
 
 
436
 
437
+ # 6. Topic Modeling Bubble Chart Slide
438
  if df_topic_data is not None and not df_topic_data.empty:
439
  # Ensure data frame is in the format expected by create_topic_word_bubbles
440
  df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
 
446
  slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
447
  else:
448
  slide = prs.slides.add_slide(chart_layout)
449
+ slide.shapes.title.text = "Topic Word Weights (Chart Failed)"
450
+ slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
451
+
452
  else:
453
+ # Placeholder slide if topic modeling is not available
454
  slide = prs.slides.add_slide(chart_layout)
455
  slide.shapes.title.text = "Topic Modeling Results"
456
  slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
 
461
  pptx_buffer.seek(0)
462
  return pptx_buffer
463
 
464
+ # --- NEW CSV GENERATION FUNCTION ---
465
  def generate_entity_csv(df):
466
  """
467
  Generates a CSV file of the extracted entities in an in-memory buffer,
 
475
  return csv_buffer
476
  # -----------------------------------
477
 
478
+ # --- Existing App Functionality (HTML) ---
 
479
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
480
  """
481
  Generates a full HTML report containing all analysis results and visualizations.
 
482
  """
483
  # 1. Generate Visualizations (Plotly HTML)
484
 
 
565
  </style></head><body>
566
  <div class="container">
567
  <h1>Entity and Topic Analysis Report</h1>
 
568
  <div class="metadata">
569
  <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
570
  <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
 
574
  <div class="highlighted-text-container">
575
  {highlighted_text_html}
576
  </div>
 
577
  <h2>2. Full Extracted Entities Table</h2>
578
  {entity_table_html}
579
  <h2>3. Data Visualizations</h2>
 
580
  <h3>3.1 Entity Distribution Treemap</h3>
581
  <div class="chart-box">{treemap_html}</div>
582
  <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
 
583
  <div class="chart-box">{pie_html}</div>
584
  <div class="chart-box">{bar_category_html}</div>
585
  <div class="chart-box">{bar_freq_html}</div>
 
586
  <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
587
  <div class="chart-box">{network_html}</div>
 
588
  <h2>4. Topic Modeling (LDA on Entities)</h2>
589
  {topic_charts_html}
 
590
  </div></body></html>
591
  """
592
  return html_content
 
618
  border: none;
619
  padding: 10px 20px;
620
  border-radius: 5px;
621
+ transition: background-color 0.3s;
622
+ }
623
+ .stButton > button:hover {
624
+ background-color: #E05C9E; /* Slightly darker pink on hover */
625
  }
626
  /* Expander header and content background */
627
  .streamlit-expanderHeader, .streamlit-expanderContent {
 
632
  """,
633
  unsafe_allow_html=True)
634
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
635
+ st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
636
  expander = st.expander("**Important notes**")
637
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
638
+ **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`. If charts in the PPTX are blank, please check your environment's `kaleido` installation/permissions.
639
  **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
640
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
641
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 
758
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
759
  st.session_state.show_results = True
760
 
761
+ # --- Display Download Links and Report Preview ---
762
  if st.session_state.show_results:
763
  df = st.session_state.results_df
 
764
 
765
  if df.empty:
766
+ st.error("No entities were extracted from the text. The report cannot be generated.")
767
  else:
768
+ # --- Generate All Report Files/Buffers ---
769
+ with st.spinner("Generating Report Files (HTML, PPTX, CSV)..."):
770
+ # 1. HTML Report Generation
771
+ html_report_content = generate_html_report(
772
+ df,
773
+ st.session_state.last_text,
774
+ st.session_state.elapsed_time,
775
+ st.session_state.topic_results
776
+ )
 
 
 
 
777
 
778
+ # 2. PPTX Report Generation
779
+ pptx_buffer = generate_pptx_report(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
  df,
781
+ st.session_state.last_text,
782
+ st.session_state.elapsed_time,
783
+ st.session_state.topic_results,
784
+ reverse_category_mapping
 
785
  )
 
 
786
 
787
+ # 3. CSV Report Generation
788
+ csv_buffer = generate_entity_csv(df)
789
+
790
+ # --- Display Downloads and Preview ---
791
+ st.markdown("## Download Analysis Reports", anchor=False)
792
  st.markdown("---")
 
793
 
794
  col1, col2, col3 = st.columns(3)
795
 
796
+ with col1:
797
+ st.download_button(
798
+ label="Download HTML Report 🌐",
799
+ data=html_report_content,
800
+ file_name="entity_topic_report.html",
801
+ mime="text/html",
802
+ help="A full, interactive report with all charts."
803
+ )
804
+ with col2:
805
+ st.download_button(
806
+ label="Download PowerPoint (.pptx) 📊",
807
+ data=pptx_buffer,
808
+ file_name="entity_topic_slides.pptx",
809
+ mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
810
+ help="A summary presentation with static charts."
811
+ )
812
+ with col3:
813
+ st.download_button(
814
+ label="Download Raw Entities (.csv) 📋",
815
+ data=csv_buffer,
816
+ file_name="extracted_entities.csv",
817
+ mime="text/csv",
818
+ help="Raw data table of all extracted entities."
819
+ )
820
 
821
  st.markdown("---")
 
 
 
822
 
823
+ # --- Display Interactive Preview ---
824
+ st.markdown("## Interactive HTML Report Preview", anchor=False)
825
+ st.info("Scroll within the box below to see the complete report and interactive charts.")
 
 
 
 
 
 
 
 
826
 
827
+ # Display the HTML report using the Streamlit component
828
+ components.html(
829
+ html_report_content,
830
+ height=800,
831
+ scrolling=True
 
 
 
 
 
 
 
832
  )
833
 
 
 
 
 
 
 
 
 
 
834
 
835
+
836
+