Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +28 -73
src/streamlit_app.py
CHANGED
|
@@ -11,7 +11,7 @@ import numpy as np
|
|
| 11 |
import re
|
| 12 |
import string
|
| 13 |
import json
|
| 14 |
-
# --- PPTX Imports
|
| 15 |
from io import BytesIO
|
| 16 |
from pptx import Presentation
|
| 17 |
from pptx.util import Inches, Pt
|
|
@@ -444,7 +444,21 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 444 |
pptx_buffer.seek(0)
|
| 445 |
return pptx_buffer
|
| 446 |
|
| 447 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
| 450 |
"""
|
|
@@ -569,65 +583,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 569 |
"""
|
| 570 |
return html_content
|
| 571 |
|
| 572 |
-
def generate_presentation_json(df, elapsed_time, df_topic_data):
    """
    Build a structured dictionary of the analysis results, laid out slide by
    slide so it can be imported into a presentation tool.

    The dictionary itself is returned; it is NOT serialized here. The caller
    is expected to serialize it (e.g. with json.dumps) before download.

    Args:
        df: DataFrame of extracted entities. The 'text' and 'category'
            columns are read. May be None or empty.
        elapsed_time: Processing time in seconds; emitted as a string
            formatted to two decimal places.
        df_topic_data: DataFrame of topic-modeling results, or None/empty
            when topic modeling produced nothing.

    Returns:
        dict: {"error": ...} when there are no entities; otherwise a dict
        with "ReportTitle", "GeneratedAt", "ProcessingTimeSeconds" and a
        four-element "Slides" list.
    """
    # Treat a missing frame like an empty one instead of failing on `.empty`.
    if df is None or df.empty:
        return {"error": "No entities found for presentation export."}

    category_counts = df['category'].value_counts()

    summary_stats = {
        "Total Entities Found": len(df),
        "Unique Entities Found": len(df['text'].unique()),
        "Top_3_Entity_Categories": category_counts.head(3).to_dict(),
    }

    # Category -> count table used for the distribution charts.
    grouped_entity_table = category_counts.reset_index()
    grouped_entity_table.columns = ['Category', 'Count']

    # Entities that occur more than once, capped at the ten most frequent.
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)

    # Fall back to an explanatory string when no topic data is available.
    if df_topic_data is not None and not df_topic_data.empty:
        topic_data = df_topic_data.to_dict('records')
    else:
        topic_data = "Not enough unique data for topic modeling."

    return {
        "ReportTitle": "NER and Topic Analysis Presentation Data",
        "GeneratedAt": time.strftime('%Y-%m-%d %H:%M:%S'),
        "ProcessingTimeSeconds": f"{elapsed_time:.2f}",
        "Slides": [
            {
                "SlideTitle": "1. Analysis Overview and Key Metrics",
                "Metrics": summary_stats,
                "Note": "This data can be used for the introductory slide.",
            },
            {
                "SlideTitle": "2. Entity Category Distribution (Chart Data)",
                "Data": grouped_entity_table.to_dict('records'),
                "Note": "Data for Pie Chart and Category Count Bar Chart.",
            },
            {
                "SlideTitle": "3. Most Frequent Entities (Top 10)",
                "Data": repeating_entities.to_dict('records'),
                "Note": "Data for the Top 10 Frequent Entities Bar Chart.",
            },
            {
                "SlideTitle": "4. Topic Modeling Results (Key Words)",
                "Data": topic_data,
                "Note": "Key entities and their weights per topic from LDA.",
            },
        ],
    }
|
| 630 |
-
|
| 631 |
|
| 632 |
# --- Page Configuration and Styling (No Sidebar) ---
|
| 633 |
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
|
|
@@ -669,7 +624,7 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
|
|
| 669 |
expander = st.expander("**Important notes**")
|
| 670 |
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
|
| 671 |
**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
|
| 672 |
-
**Results:** Results are compiled into a single, comprehensive **HTML report**
|
| 673 |
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
|
| 674 |
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
|
| 675 |
|
|
@@ -680,7 +635,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
|
|
| 680 |
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
|
| 681 |
|
| 682 |
# --- Model Loading ---
|
| 683 |
-
@st.
|
| 684 |
def load_ner_model():
|
| 685 |
"""Loads the GLiNER model and caches it."""
|
| 686 |
try:
|
|
@@ -898,7 +853,7 @@ if st.session_state.show_results:
|
|
| 898 |
st.markdown("---")
|
| 899 |
st.markdown("### Download Full Report Artifacts")
|
| 900 |
|
| 901 |
-
# 1. HTML Report Download
|
| 902 |
html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
|
| 903 |
st.download_button(
|
| 904 |
label="Download Comprehensive HTML Report",
|
|
@@ -908,7 +863,7 @@ if st.session_state.show_results:
|
|
| 908 |
type="primary"
|
| 909 |
)
|
| 910 |
|
| 911 |
-
# 2. PowerPoint PPTX Download (
|
| 912 |
pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
|
| 913 |
st.download_button(
|
| 914 |
label="Download Presentation Slides (.pptx)",
|
|
@@ -918,17 +873,17 @@ if st.session_state.show_results:
|
|
| 918 |
type="primary"
|
| 919 |
)
|
| 920 |
|
| 921 |
-
# 3.
|
| 922 |
-
|
| 923 |
-
presentation_json_data = json.dumps(presentation_data, indent=4)
|
| 924 |
-
|
| 925 |
st.download_button(
|
| 926 |
-
label="Download
|
| 927 |
-
data=
|
| 928 |
-
file_name="
|
| 929 |
-
mime="
|
| 930 |
type="secondary"
|
| 931 |
)
|
| 932 |
|
| 933 |
|
| 934 |
|
|
|
|
|
|
|
|
|
| 11 |
import re
|
| 12 |
import string
|
| 13 |
import json
|
| 14 |
+
# --- PPTX Imports ---
|
| 15 |
from io import BytesIO
|
| 16 |
from pptx import Presentation
|
| 17 |
from pptx.util import Inches, Pt
|
|
|
|
| 444 |
pptx_buffer.seek(0)
|
| 445 |
return pptx_buffer
|
| 446 |
|
| 447 |
+
# --- NEW CSV GENERATION FUNCTION ---
|
| 448 |
+
def generate_entity_csv(df):
    """
    Generates a CSV file of the extracted entities in an in-memory buffer.

    The export contains, in this order, the columns text, label, category,
    score, start and end. Any other columns in `df` are left out.

    Args:
        df: DataFrame of extracted entities.

    Returns:
        BytesIO: UTF-8 encoded CSV with a header row and no index column,
        positioned at offset 0 so it can be handed straight to a download
        widget.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    # reindex (rather than df[[...]]) keeps the export from raising KeyError
    # when the frame is empty or lacks one of the columns: the header is
    # always written and absent columns are emitted as empty fields.
    df_export = df.reindex(columns=export_columns)
    return BytesIO(df_export.to_csv(index=False).encode('utf-8'))
|
| 459 |
+
# -----------------------------------
|
| 460 |
+
|
| 461 |
+
# --- Existing App Functionality (HTML) ---
|
| 462 |
|
| 463 |
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
| 464 |
"""
|
|
|
|
| 583 |
"""
|
| 584 |
return html_content
|
| 585 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
# --- Page Configuration and Styling (No Sidebar) ---
|
| 588 |
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
|
|
|
|
| 624 |
expander = st.expander("**Important notes**")
|
| 625 |
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
|
| 626 |
**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
|
| 627 |
+
**Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
|
| 628 |
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
|
| 629 |
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
|
| 630 |
|
|
|
|
| 635 |
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
|
| 636 |
|
| 637 |
# --- Model Loading ---
|
| 638 |
+
@st.cache_resource
|
| 639 |
def load_ner_model():
|
| 640 |
"""Loads the GLiNER model and caches it."""
|
| 641 |
try:
|
|
|
|
| 853 |
st.markdown("---")
|
| 854 |
st.markdown("### Download Full Report Artifacts")
|
| 855 |
|
| 856 |
+
# 1. HTML Report Download (Retained)
|
| 857 |
html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
|
| 858 |
st.download_button(
|
| 859 |
label="Download Comprehensive HTML Report",
|
|
|
|
| 863 |
type="primary"
|
| 864 |
)
|
| 865 |
|
| 866 |
+
# 2. PowerPoint PPTX Download (Retained)
|
| 867 |
pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
|
| 868 |
st.download_button(
|
| 869 |
label="Download Presentation Slides (.pptx)",
|
|
|
|
| 873 |
type="primary"
|
| 874 |
)
|
| 875 |
|
| 876 |
+
# 3. CSV Data Download (NEW)
|
| 877 |
+
csv_buffer = generate_entity_csv(df)
|
|
|
|
|
|
|
| 878 |
st.download_button(
|
| 879 |
+
label="Download Extracted Entities (CSV)",
|
| 880 |
+
data=csv_buffer,
|
| 881 |
+
file_name="extracted_entities.csv",
|
| 882 |
+
mime="text/csv",
|
| 883 |
type="secondary"
|
| 884 |
)
|
| 885 |
|
| 886 |
|
| 887 |
|
| 888 |
+
|
| 889 |
+
|