AIEcosystem committed on
Commit
2ec8241
·
verified ·
1 Parent(s): 965b307

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +73 -196
src/streamlit_app.py CHANGED
@@ -42,30 +42,34 @@ os.environ['HF_HOME'] = '/tmp'
42
  # --- Color Map for Highlighting and Network Graph Nodes ---
43
  entity_color_map = {
44
  "person": "#10b981",
45
- "username": "#3b82f6",
46
- "hashtag": "#4ade80",
47
- "mention" : "#f97316",
48
  "organization": "#f59e0b",
49
- "community": "#8b5cf6",
50
- "position": "#ec4899",
51
- "location": "#06b6d4",
52
- "event": "#f43f5e",
53
- "product": "#a855f7",
54
- "platform": "#eab308",
55
- "date": "#6366f1",
56
- "media_type": "#14b8a6",
57
- "url": "#60a5fa",
58
- "nationality_religion": "#fb7185"
59
  }
60
 
61
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
62
  labels = list(entity_color_map.keys())
 
 
 
63
  category_mapping = {
64
- "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
65
- "Location & Organization": ["location", "organization"],
66
- "Temporal & Events": ["event", "date"],
67
- "Digital & Products": ["platform", "product", "media_type", "url"],
68
  }
 
 
 
 
 
69
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
70
 
71
 
@@ -293,156 +297,11 @@ def generate_network_graph(df, raw_text):
293
  return fig
294
 
295
 
296
- # --- PPTX HELPER FUNCTIONS (Integrated from generate_report.py) ---
297
-
298
- def fig_to_image_buffer(fig):
299
- """
300
- Converts a Plotly figure object into a BytesIO buffer containing PNG data.
301
- Requires 'kaleido' to be installed for image export.
302
- Returns None if export fails.
303
- """
304
- try:
305
- # Use pio.to_image to convert the figure to a PNG byte array
306
- img_bytes = pio.to_image(fig, format="png", width=900, height=500, scale=2)
307
- img_buffer = BytesIO(img_bytes)
308
- return img_buffer
309
- except Exception as e:
310
- # In a Streamlit environment, we can't show this error directly in the app execution flow
311
- print(f"Error converting Plotly figure to image: {e}")
312
- return None
313
-
314
- # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
315
-
316
- def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
317
- """
318
- Generates a PowerPoint presentation (.pptx) file containing key analysis results.
319
- Returns the file content as a BytesIO buffer.
320
- """
321
- prs = Presentation()
322
- # Layout 5: Title and Content (often good for charts)
323
- chart_layout = prs.slide_layouts[5]
324
-
325
- # 1. Title Slide
326
- title_slide_layout = prs.slide_layouts[0]
327
- slide = prs.slides.add_slide(title_slide_layout)
328
- title = slide.shapes.title
329
- subtitle = slide.placeholders[1]
330
- title.text = "NER & Topic Analysis Report"
331
- subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
332
-
333
- # 2. Source Text Slide
334
- slide = prs.slides.add_slide(chart_layout)
335
- slide.shapes.title.text = "Analyzed Source Text"
336
-
337
- # Add the raw text to a text box
338
- left = Inches(0.5)
339
- top = Inches(1.5)
340
- width = Inches(9.0)
341
- height = Inches(5.0)
342
- txBox = slide.shapes.add_textbox(left, top, width, height)
343
- tf = txBox.text_frame
344
- tf.margin_top = Inches(0.1)
345
- tf.margin_bottom = Inches(0.1)
346
- tf.word_wrap = True
347
- p = tf.add_paragraph()
348
- p.text = text_input
349
- p.font.size = Pt(14)
350
- p.font.name = 'Arial'
351
-
352
- # 3. Entity Summary Slide (Table)
353
- slide = prs.slides.add_slide(chart_layout)
354
- slide.shapes.title.text = "Entity Summary (Count by Category and Label)"
355
-
356
- # Create the summary table using the app's established logic
357
- grouped_entity_table = df['label'].value_counts().reset_index()
358
- grouped_entity_table.columns = ['Entity Label', 'Count']
359
- grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
360
- lambda x: reverse_category_mapping.get(x, 'Other')
361
- )
362
- grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]
363
-
364
- # Simple way to insert a table:
365
- rows, cols = grouped_entity_table.shape
366
- x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
367
- # Add 1 row for the header
368
- table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table
369
-
370
- # Set column widths
371
- table.columns[0].width = Inches(2.7)
372
- table.columns[1].width = Inches(2.8)
373
- table.columns[2].width = Inches(2.5)
374
-
375
- # Set column headers
376
- for i, col in enumerate(grouped_entity_table.columns):
377
- cell = table.cell(0, i)
378
- cell.text = col
379
- cell.fill.solid()
380
- # Optional: Add simple styling to header
381
-
382
- # Fill in the data
383
- for i in range(rows):
384
- for j in range(cols):
385
- cell = table.cell(i+1, j)
386
- cell.text = str(grouped_entity_table.iloc[i, j])
387
- # Optional: Style data cells
388
-
389
- # 4. Treemap Slide (Visualization)
390
- fig_treemap = px.treemap(
391
- df,
392
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
393
- values='score',
394
- color='category',
395
- title="Entity Distribution by Category and Label",
396
- color_discrete_sequence=px.colors.qualitative.Dark24
397
- )
398
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
399
- treemap_image = fig_to_image_buffer(fig_treemap)
400
-
401
- if treemap_image:
402
- slide = prs.slides.add_slide(chart_layout)
403
- slide.shapes.title.text = "Entity Distribution Treemap"
404
- slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
405
-
406
- # 5. Entity Count Bar Chart Slide (Visualization)
407
- grouped_counts = df['category'].value_counts().reset_index()
408
- grouped_counts.columns = ['Category', 'Count']
409
- fig_bar_category = px.bar(
410
- grouped_counts,
411
- x='Category',
412
- y='Count',
413
- color='Category',
414
- title='Total Entities per Category',
415
- color_discrete_sequence=px.colors.qualitative.Pastel
416
- )
417
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
418
- bar_category_image = fig_to_image_buffer(fig_bar_category)
419
-
420
- if bar_category_image:
421
- slide = prs.slides.add_slide(chart_layout)
422
- slide.shapes.title.text = "Total Entities per Category"
423
- slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
424
 
425
- # 6. Topic Modeling Bubble Chart Slide
426
- if df_topic_data is not None and not df_topic_data.empty:
427
- # Ensure data frame is in the format expected by create_topic_word_bubbles
428
- df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
429
- bubble_figure = create_topic_word_bubbles(df_topic_data_pptx)
430
- bubble_image = fig_to_image_buffer(bubble_figure)
431
- if bubble_image:
432
- slide = prs.slides.add_slide(chart_layout)
433
- slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
434
- slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
435
- else:
436
- # Placeholder slide if topic modeling is not available
437
- slide = prs.slides.add_slide(chart_layout)
438
- slide.shapes.title.text = "Topic Modeling Results"
439
- slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
440
 
441
- # Save the presentation to an in-memory buffer
442
- pptx_buffer = BytesIO()
443
- prs.save(pptx_buffer)
444
- pptx_buffer.seek(0)
445
- return pptx_buffer
446
 
447
  # --- NEW CSV GENERATION FUNCTION ---
448
  def generate_entity_csv(df):
@@ -568,9 +427,9 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
568
  <div class="chart-box">{pie_html}</div>
569
  <div class="chart-box">{bar_category_html}</div>
570
  <div class="chart-box">{bar_freq_html}</div>
571
- <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
572
  <div class="chart-box">{network_html}</div>
573
- <h2>4. Topic Modeling (LDA on Entities)</h2>
574
  {topic_charts_html}
575
  </div></body></html>
576
  """
@@ -612,13 +471,24 @@ st.markdown(
612
  </style>
613
  """,
614
  unsafe_allow_html=True)
615
- st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
616
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 
 
 
 
617
  expander = st.expander("**Important notes**")
618
- expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
619
- **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
620
- **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
621
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 
 
 
 
 
 
 
622
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
623
 
624
  # --- Comet ML Setup (Placeholder/Conditional) ---
@@ -753,20 +623,23 @@ if st.session_state.show_results:
753
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
754
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
755
 
756
- # 2. Entity Summary Table
757
- st.markdown("### 2. Entity Summary Table (Count by Label)")
758
- grouped_entity_table = df['label'].value_counts().reset_index()
759
- grouped_entity_table.columns = ['Entity Label', 'Count']
760
- grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
761
- st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
762
- st.markdown("---")
763
 
764
- # 3. Detailed Entity Analysis Tabs
765
- st.markdown("### 3. Detailed Entity Analysis")
766
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
767
 
768
  with tab_category_details:
769
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
 
 
 
 
 
 
 
 
 
770
  unique_categories = list(category_mapping.keys())
771
  tabs_category = st.tabs(unique_categories)
772
  for category, tab in zip(unique_categories, tabs_category):
@@ -795,9 +668,9 @@ if st.session_state.show_results:
795
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
796
  st.plotly_chart(fig_treemap, use_container_width=True)
797
 
798
- # 4. Comparative Charts
799
  st.markdown("---")
800
- st.markdown("### 4. Comparative Charts")
801
 
802
  col1, col2, col3 = st.columns(3)
803
 
@@ -826,12 +699,12 @@ if st.session_state.show_results:
826
  st.info("No entities repeat for frequency chart.")
827
 
828
  st.markdown("---")
829
- st.markdown("### 5. Entity Co-occurrence Network")
830
  network_fig = generate_network_graph(df, st.session_state.last_text)
831
  st.plotly_chart(network_fig, use_container_width=True)
832
 
833
  st.markdown("---")
834
- st.markdown("### 6. Topic Modeling Analysis")
835
 
836
  if df_topic_data is not None and not df_topic_data.empty:
837
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -856,17 +729,9 @@ if st.session_state.show_results:
856
  type="primary"
857
  )
858
 
859
- # 2. PowerPoint PPTX Download (Retained)
860
- pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
861
- st.download_button(
862
- label="Download Presentation Slides (.pptx)",
863
- data=pptx_buffer,
864
- file_name="ner_topic_report.pptx",
865
- mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
866
- type="primary"
867
- )
868
 
869
- # 3. CSV Data Download (NEW)
870
  csv_buffer = generate_entity_csv(df)
871
  st.download_button(
872
  label="Download Extracted Entities (CSV)",
@@ -875,4 +740,16 @@ if st.session_state.show_results:
875
  mime="text/csv",
876
  type="secondary"
877
  )
878
-
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # --- Color Map for Highlighting and Network Graph Nodes ---
43
  entity_color_map = {
44
  "person": "#10b981",
45
+ "country": "#3b82f6",
46
+ "city": "#4ade80",
47
+
48
  "organization": "#f59e0b",
49
+ "date": "#8b5cf6",
50
+ "time": "#ec4899",
51
+ "cardinal": "#06b6d4",
52
+ "money": "#f43f5e",
53
+ "position": "#a855f7",
54
+
 
 
 
 
55
  }
56
 
57
  # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
58
  labels = list(entity_color_map.keys())
59
+
60
+
61
+
62
  category_mapping = {
63
+ "People": ["person", "organization", "position"],
64
+ "Locations": ["country", "city"],
65
+ "Time": ["date", "time"],
66
+ "Numbers": ["money", "cardinal"]
67
  }
68
+
69
+
70
+
71
+
72
+
73
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
74
 
75
 
 
297
  return fig
298
 
299
 
300
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
+
 
 
 
 
305
 
306
  # --- NEW CSV GENERATION FUNCTION ---
307
  def generate_entity_csv(df):
 
427
  <div class="chart-box">{pie_html}</div>
428
  <div class="chart-box">{bar_category_html}</div>
429
  <div class="chart-box">{bar_freq_html}</div>
430
+ <h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
431
  <div class="chart-box">{network_html}</div>
432
+ <h2>4. Topic Modelling</h2>
433
  {topic_charts_html}
434
  </div></body></html>
435
  """
 
471
  </style>
472
  """,
473
  unsafe_allow_html=True)
474
+ st.subheader("Entity and Topic Analysis Report Generator", divider="rainbow")
475
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
476
+
477
+
478
+
479
+
480
  expander = st.expander("**Important notes**")
481
+ expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
482
+
483
+ **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
484
+
485
+ **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
486
+
487
+ **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
488
+
489
+
490
+
491
+
492
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
493
 
494
  # --- Comet ML Setup (Placeholder/Conditional) ---
 
623
  st.markdown("### 1. Analyzed Text with Highlighted Entities")
624
  st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
625
 
626
+
 
 
 
 
 
 
627
 
628
+ # 2. Detailed Entity Analysis Tabs
629
+ st.markdown("### 2. Detailed Entity Analysis")
630
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
631
 
632
  with tab_category_details:
633
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
634
+ with st.expander("See Glossary of tags"):
635
+ st.write('''
636
+ - **text**: ['entity extracted from your text data']
637
+ - **label**: ['label (tag) assigned to a given extracted entity']
638
+ - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
639
+ - **start**: ['index of the start of the corresponding entity']
640
+ - **end**: ['index of the end of the corresponding entity']
641
+ ''')
642
+
643
  unique_categories = list(category_mapping.keys())
644
  tabs_category = st.tabs(unique_categories)
645
  for category, tab in zip(unique_categories, tabs_category):
 
668
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
669
  st.plotly_chart(fig_treemap, use_container_width=True)
670
 
671
+ # 3. Comparative Charts
672
  st.markdown("---")
673
+ st.markdown("### 3. Comparative Charts")
674
 
675
  col1, col2, col3 = st.columns(3)
676
 
 
699
  st.info("No entities repeat for frequency chart.")
700
 
701
  st.markdown("---")
702
+ st.markdown("### 4. Entity Relationship Map")
703
  network_fig = generate_network_graph(df, st.session_state.last_text)
704
  st.plotly_chart(network_fig, use_container_width=True)
705
 
706
  st.markdown("---")
707
+ st.markdown("### 5. Topic Modelling Analysis")
708
 
709
  if df_topic_data is not None and not df_topic_data.empty:
710
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
729
  type="primary"
730
  )
731
 
732
+
 
 
 
 
 
 
 
 
733
 
734
+ # 2. CSV Data Download (NEW)
735
  csv_buffer = generate_entity_csv(df)
736
  st.download_button(
737
  label="Download Extracted Entities (CSV)",
 
740
  mime="text/csv",
741
  type="secondary"
742
  )
743
+
744
+ with st.expander("See Glossary of tags"):
745
+ st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
746
+ code = '''
747
+ <iframe
748
+ src="https://aiecosystem-dataharvest.hf.space"
749
+ frameborder="0"
750
+ width="850"
751
+ height="450"
752
+ ></iframe>
753
+ '''
754
+ st.code(code, language="html")
755
+