AIEcosystem committed on
Commit
471ac48
·
verified ·
1 Parent(s): f6ebbfc

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +28 -73
src/streamlit_app.py CHANGED
@@ -11,7 +11,7 @@ import numpy as np
11
  import re
12
  import string
13
  import json
14
- # --- PPTX Imports (NEW) ---
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
@@ -444,7 +444,21 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
444
  pptx_buffer.seek(0)
445
  return pptx_buffer
446
 
447
- # --- Existing App Functionality (HTML and JSON) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
450
  """
@@ -569,65 +583,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
569
  """
570
  return html_content
571
 
572
- def generate_presentation_json(df, elapsed_time, df_topic_data):
573
- """
574
- Generates a structured dictionary of all analysis results suitable for
575
- importing into a presentation tool, then serializes it to JSON.
576
- """
577
- if df.empty:
578
- return {"error": "No entities found for presentation export."}
579
-
580
- total_entities = len(df)
581
- unique_entities = len(df['text'].unique())
582
- category_counts = df['category'].value_counts()
583
- top_categories = category_counts.head(3).to_dict()
584
-
585
- summary_stats = {
586
- "Total Entities Found": total_entities,
587
- "Unique Entities Found": unique_entities,
588
- "Top_3_Entity_Categories": top_categories
589
- }
590
-
591
- grouped_entity_table = category_counts.reset_index()
592
- grouped_entity_table.columns = ['Category', 'Count']
593
-
594
- word_counts = df['text'].value_counts().reset_index()
595
- word_counts.columns = ['Entity', 'Count']
596
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
597
-
598
- topic_data = "Not enough unique data for topic modeling."
599
- if df_topic_data is not None and not df_topic_data.empty:
600
- topic_data = df_topic_data.to_dict('records')
601
-
602
- presentation_data = {
603
- "ReportTitle": "NER and Topic Analysis Presentation Data",
604
- "GeneratedAt": time.strftime('%Y-%m-%d %H:%M:%S'),
605
- "ProcessingTimeSeconds": f"{elapsed_time:.2f}",
606
- "Slides": [
607
- {
608
- "SlideTitle": "1. Analysis Overview and Key Metrics",
609
- "Metrics": summary_stats,
610
- "Note": "This data can be used for the introductory slide."
611
- },
612
- {
613
- "SlideTitle": "2. Entity Category Distribution (Chart Data)",
614
- "Data": grouped_entity_table.to_dict('records'),
615
- "Note": "Data for Pie Chart and Category Count Bar Chart."
616
- },
617
- {
618
- "SlideTitle": "3. Most Frequent Entities (Top 10)",
619
- "Data": repeating_entities.to_dict('records'),
620
- "Note": "Data for the Top 10 Frequent Entities Bar Chart."
621
- },
622
- {
623
- "SlideTitle": "4. Topic Modeling Results (Key Words)",
624
- "Data": topic_data,
625
- "Note": "Key entities and their weights per topic from LDA."
626
- }
627
- ]
628
- }
629
- return presentation_data
630
-
631
 
632
  # --- Page Configuration and Styling (No Sidebar) ---
633
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
@@ -669,7 +624,7 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
669
  expander = st.expander("**Important notes**")
670
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
671
  **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
672
- **Results:** Results are compiled into a single, comprehensive **HTML report** and a **PowerPoint (.pptx) file** for easy download and sharing.
673
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
674
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
675
 
@@ -680,7 +635,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
680
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
681
 
682
  # --- Model Loading ---
683
- @st.cache_resource
684
  def load_ner_model():
685
  """Loads the GLiNER model and caches it."""
686
  try:
@@ -898,7 +853,7 @@ if st.session_state.show_results:
898
  st.markdown("---")
899
  st.markdown("### Download Full Report Artifacts")
900
 
901
- # 1. HTML Report Download
902
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
903
  st.download_button(
904
  label="Download Comprehensive HTML Report",
@@ -908,7 +863,7 @@ if st.session_state.show_results:
908
  type="primary"
909
  )
910
 
911
- # 2. PowerPoint PPTX Download (NEW)
912
  pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
913
  st.download_button(
914
  label="Download Presentation Slides (.pptx)",
@@ -918,17 +873,17 @@ if st.session_state.show_results:
918
  type="primary"
919
  )
920
 
921
- # 3. Presentation JSON Data Download
922
- presentation_data = generate_presentation_json(df, st.session_state.elapsed_time, df_topic_data)
923
- presentation_json_data = json.dumps(presentation_data, indent=4)
924
-
925
  st.download_button(
926
- label="Download Presentation Data (JSON)",
927
- data=presentation_json_data,
928
- file_name="ner_presentation_data.json",
929
- mime="application/json",
930
  type="secondary"
931
  )
932
 
933
 
934
 
 
 
 
11
  import re
12
  import string
13
  import json
14
+ # --- PPTX Imports ---
15
  from io import BytesIO
16
  from pptx import Presentation
17
  from pptx.util import Inches, Pt
 
444
  pptx_buffer.seek(0)
445
  return pptx_buffer
446
 
447
# --- NEW CSV GENERATION FUNCTION ---
def generate_entity_csv(df):
    """
    Serialize the extracted-entity DataFrame to CSV in an in-memory buffer.

    Only the columns text, label, category, score, start and end are
    exported, with the row index omitted. Returns a BytesIO positioned
    at offset 0, ready to hand straight to st.download_button.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    csv_text = df[export_columns].to_csv(index=False)
    # BytesIO(initial_bytes) leaves the stream position at 0, so no seek needed.
    return BytesIO(csv_text.encode('utf-8'))
# -----------------------------------
460
+
461
+ # --- Existing App Functionality (HTML) ---
462
 
463
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
464
  """
 
583
  """
584
  return html_content
585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
  # --- Page Configuration and Styling (No Sidebar) ---
588
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 
624
  expander = st.expander("**Important notes**")
625
  expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
626
  **Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
627
+ **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
628
  **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
629
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
630
 
 
635
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
636
 
637
  # --- Model Loading ---
638
+ @st.cache_resource
639
  def load_ner_model():
640
  """Loads the GLiNER model and caches it."""
641
  try:
 
853
  st.markdown("---")
854
  st.markdown("### Download Full Report Artifacts")
855
 
856
+ # 1. HTML Report Download (Retained)
857
  html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
858
  st.download_button(
859
  label="Download Comprehensive HTML Report",
 
863
  type="primary"
864
  )
865
 
866
+ # 2. PowerPoint PPTX Download (Retained)
867
  pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
868
  st.download_button(
869
  label="Download Presentation Slides (.pptx)",
 
873
  type="primary"
874
  )
875
 
876
+ # 3. CSV Data Download (NEW)
877
+ csv_buffer = generate_entity_csv(df)
 
 
878
  st.download_button(
879
+ label="Download Extracted Entities (CSV)",
880
+ data=csv_buffer,
881
+ file_name="extracted_entities.csv",
882
+ mime="text/csv",
883
  type="secondary"
884
  )
885
 
886
 
887
 
888
+
889
+