Spaces:

AIEcosystem
/

relationship-map

Sleeping

App Files Files Community

AIEcosystem committed on Oct 8, 2025

Commit

bebe5f4

verified ·

1 Parent(s): d81c772

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +134 -45

src/streamlit_app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from io import BytesIO
 from pptx import Presentation
 from pptx.util import Inches, Pt
 from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
-import plotly.io as pio # Required for image export
 # ---------------------------
 # --- Stable Scikit-learn LDA Imports ---
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -184,14 +184,14 @@ def create_topic_word_bubbles(df_topic_data):
 def generate_network_graph(df, raw_text):
     """
     Generates a network graph visualization (Node Plot) with edges
-    based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
     """
-    # Using the existing generate_network_graph logic from previous context...
     entity_counts = df['text'].value_counts().reset_index()
     entity_counts.columns = ['text', 'frequency']
     unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
     if unique_entities.shape[0] < 2:
         return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
     num_nodes = len(unique_entities)
@@ -307,8 +307,9 @@ def fig_to_image_buffer(fig):
         img_buffer = BytesIO(img_bytes)
         return img_buffer
     except Exception as e:
-        # In a Streamlit environment, we can't show this error directly in the app execution flow
-        print(f"Error converting Plotly figure to image: {e}")
         return None
 # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
@@ -322,7 +323,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     # Layout 5: Title and Content (often good for charts)
     chart_layout = prs.slide_layouts[5]
-    # 1. Title Slide
     title_slide_layout = prs.slide_layouts[0]
     slide = prs.slides.add_slide(title_slide_layout)
     title = slide.shapes.title
@@ -330,9 +331,9 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     title.text = "NER & Topic Analysis Report"
     subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
-    # 2. Source Text Slide
     slide = prs.slides.add_slide(chart_layout)
-    slide.shapes.title.text = "Analyzed Source Text"
     # Add the raw text to a text box
     left = Inches(0.5)
@@ -349,44 +350,83 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     p.font.size = Pt(14)
     p.font.name = 'Arial'
-    # 3. Entity Summary Slide (Table)
     slide = prs.slides.add_slide(chart_layout)
-    slide.shapes.title.text = "Entity Summary (Count by Category and Label)"
-    # Create the summary table using the app's established logic
-    grouped_entity_table = df['label'].value_counts().reset_index()
-    grouped_entity_table.columns = ['Entity Label', 'Count']
-    grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
-        lambda x: reverse_category_mapping.get(x, 'Other')
-    )
-    grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]
     # Simple way to insert a table:
-    rows, cols = grouped_entity_table.shape
-    x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
     # Add 1 row for the header
-    table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table
     # Set column widths
-    table.columns[0].width = Inches(2.7)
-    table.columns[1].width = Inches(2.8)
-    table.columns[2].width = Inches(2.5)
     # Set column headers
-    for i, col in enumerate(grouped_entity_table.columns):
         cell = table.cell(0, i)
         cell.text = col
-        cell.fill.solid()
         # Optional: Add simple styling to header
     # Fill in the data
-    for i in range(rows):
         for j in range(cols):
             cell = table.cell(i+1, j)
-            cell.text = str(grouped_entity_table.iloc[i, j])
             # Optional: Style data cells
-    # 4. Treemap Slide (Visualization)
     fig_treemap = px.treemap(
         df,
         path=[px.Constant("All Entities"), 'category', 'label', 'text'],
@@ -398,14 +438,31 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     treemap_image = fig_to_image_buffer(fig_treemap)
     if treemap_image:
-        slide = prs.slides.add_slide(chart_layout)
-        slide.shapes.title.text = "Entity Distribution Treemap"
         slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
-    # 5. Entity Count Bar Chart Slide (Visualization)
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
     fig_bar_category = px.bar(
         grouped_counts,
         x='Category',
@@ -417,12 +474,47 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
     bar_category_image = fig_to_image_buffer(fig_bar_category)
     if bar_category_image:
-        slide = prs.slides.add_slide(chart_layout)
-        slide.shapes.title.text = "Total Entities per Category"
         slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
-    # 6. Topic Modeling Bubble Chart Slide
     if df_topic_data is not None and not df_topic_data.empty:
         # Ensure data frame is in the format expected by create_topic_word_bubbles
         df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
@@ -432,8 +524,11 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
             slide = prs.slides.add_slide(chart_layout)
             slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
             slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
     else:
-        # Placeholder slide if topic modeling is not available
         slide = prs.slides.add_slide(chart_layout)
         slide.shapes.title.text = "Topic Modeling Results"
         slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
@@ -444,7 +539,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
     pptx_buffer.seek(0)
     return pptx_buffer
-# --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
     """
     Generates a CSV file of the extracted entities in an in-memory buffer,
@@ -458,7 +553,7 @@ def generate_entity_csv(df):
     return csv_buffer
 # -----------------------------------
-# --- Existing App Functionality (HTML) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     """
@@ -623,7 +718,7 @@ st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 expander = st.expander("**Important notes**")
 expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
-**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
 **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
 **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
@@ -635,7 +730,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 # --- Model Loading ---
-@st.cache_resource
 def load_ner_model():
     """Loads the GLiNER model and caches it."""
     try:
@@ -882,9 +977,3 @@ if st.session_state.show_results:
             mime="text/csv",
             type="secondary"
         )

 from pptx import Presentation
 from pptx.util import Inches, Pt
 from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
+import plotly.io as pio # Required for image export (needs kaleido!)
 # ---------------------------
 # --- Stable Scikit-learn LDA Imports ---
 from sklearn.feature_extraction.text import TfidfVectorizer
 def generate_network_graph(df, raw_text):
     """
     Generates a network graph visualization (Node Plot) with edges
+    based on entity co-occurrence in sentences.
     """
     entity_counts = df['text'].value_counts().reset_index()
     entity_counts.columns = ['text', 'frequency']
     unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
     if unique_entities.shape[0] < 2:
+        # Return a blank figure if not enough entities
         return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
     num_nodes = len(unique_entities)
         img_buffer = BytesIO(img_bytes)
         return img_buffer
     except Exception as e:
+        # Print the error for debugging purposes in the Streamlit console
+        # This message is CRITICAL for the user to understand why plots are missing
+        print(f"ERROR: Failed to convert Plotly figure to image for PPTX. This usually means 'kaleido' is missing. Error: {e}")
         return None
 # --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
     # Layout 5: Title and Content (often good for charts)
     chart_layout = prs.slide_layouts[5]
+    # --- 1. Title Slide ---
     title_slide_layout = prs.slide_layouts[0]
     slide = prs.slides.add_slide(title_slide_layout)
     title = slide.shapes.title
     title.text = "NER & Topic Analysis Report"
     subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
+    # --- 2. Source Text Slide ---
     slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Analyzed Source Text (Raw)"
     # Add the raw text to a text box
     left = Inches(0.5)
     p.font.size = Pt(14)
     p.font.name = 'Arial'
+    # --- 3. Highlighted Text Slide ---
     slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Analyzed Source Text with Entity Highlights"
+    # Generate the HTML for highlighting (we need to strip the HTML formatting for PPTX text box)
+    highlighted_html = highlight_entities(text_input, df)
+    # Simple regex to remove the HTML tags, keeping only the text content
+    highlighted_clean_text = re.sub(r'<[^>]*>', '', highlighted_html)
+    highlighted_clean_text = highlighted_clean_text.replace("div style", "").strip()
+    # Add the text to a text box
+    left = Inches(0.5)
+    top = Inches(1.5)
+    width = Inches(9.0)
+    height = Inches(5.0)
+    txBox = slide.shapes.add_textbox(left, top, width, height)
+    tf = txBox.text_frame
+    tf.margin_top = Inches(0.1)
+    tf.margin_bottom = Inches(0.1)
+    tf.word_wrap = True
+    p = tf.add_paragraph()
+    p.text = highlighted_clean_text
+    p.font.size = Pt(12)
+    p.font.name = 'Arial'
+    p.font.color.rgb = prs.theme.theme_color_scheme.get_color(0) # Default text color
+    # --- 4. Extracted Entities Table Slide ---
+    slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Extracted Entities Table"
+    # Prepare the dataframe for the table
+    table_df = df[['category', 'label', 'text', 'score']].sort_values(by=['category', 'label', 'score'], ascending=[True, True, False])
     # Simple way to insert a table:
+    rows, cols = table_df.shape
+    # Cap the table size for the slide, otherwise it gets too cramped
+    max_rows = 15
+    table_to_display = table_df.head(max_rows)
+    rows_display = len(table_to_display)
+    x, y, cx, cy = Inches(0.2), Inches(1.2), Inches(9.6), Inches(6.0)
     # Add 1 row for the header
+    table = slide.shapes.add_table(rows_display + 1, cols, x, y, cx, cy).table
     # Set column widths
+    table.columns[0].width = Inches(2.0) # Category
+    table.columns[1].width = Inches(2.0) # Label
+    table.columns[2].width = Inches(4.0) # Text
+    table.columns[3].width = Inches(1.6) # Score
     # Set column headers
+    header_cols = ['Category', 'Label', 'Text', 'Score']
+    for i, col in enumerate(header_cols):
         cell = table.cell(0, i)
         cell.text = col
         # Optional: Add simple styling to header
     # Fill in the data
+    for i in range(rows_display):
         for j in range(cols):
             cell = table.cell(i+1, j)
+            if table_df.columns[j] == 'score':
+                cell.text = f"{table_to_display.iloc[i, j]:.4f}"
+            else:
+                cell.text = str(table_to_display.iloc[i, j])
             # Optional: Style data cells
+    if rows > max_rows:
+        slide.placeholders[1].text = f"... Table truncated for slide readability. Full data contains {rows} entries. See CSV file for all data."
+        slide.placeholders[1].top = Inches(6.5)
+        slide.placeholders[1].left = Inches(0.5)
+        slide.placeholders[1].width = Inches(9.0)
+        slide.placeholders[1].height = Inches(0.5)
+    # --- 5. Treemap Slide (Visualization) ---
     fig_treemap = px.treemap(
         df,
         path=[px.Constant("All Entities"), 'category', 'label', 'text'],
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     treemap_image = fig_to_image_buffer(fig_treemap)
+    slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Entity Distribution Treemap"
     if treemap_image:
         slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
+    else:
+        slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
+    # --- 6. Pie Chart Slide (Visualization) ---
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
+    fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
+    fig_pie.update_layout(margin=dict(t=50, b=10))
+    pie_image = fig_to_image_buffer(fig_pie)
+    slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Entity Distribution Pie Chart"
+    if pie_image:
+        # Pie charts often look better centered on the slide
+        slide.shapes.add_picture(pie_image, Inches(1.5), Inches(1.5), width=Inches(7.0))
+    else:
+        slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
+    # --- 7. Category Count Bar Chart Slide (Visualization) ---
     fig_bar_category = px.bar(
         grouped_counts,
         x='Category',
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
     bar_category_image = fig_to_image_buffer(fig_bar_category)
+    slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Total Entities per Category Bar Chart"
     if bar_category_image:
         slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
+    else:
+        slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
+    # --- 8. Most Frequent Entities Bar Chart Slide (Visualization) ---
+    word_counts = df['text'].value_counts().reset_index()
+    word_counts.columns = ['Entity', 'Count']
+    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
+    if not repeating_entities.empty:
+        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
+        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
+        bar_freq_image = fig_to_image_buffer(fig_bar_freq)
+        slide = prs.slides.add_slide(chart_layout)
+        slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
+        if bar_freq_image:
+            slide.shapes.add_picture(bar_freq_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
+        else:
+            slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
+    else:
+        slide = prs.slides.add_slide(chart_layout)
+        slide.shapes.title.text = "Top 10 Most Frequent Entities Bar Chart"
+        slide.placeholders[1].text = "No entities repeat in the text, so a frequency chart was not generated."
+    # --- 9. Network Graph Slide (Visualization) ---
+    network_fig = generate_network_graph(df, text_input)
+    network_image = fig_to_image_buffer(network_fig)
+    slide = prs.slides.add_slide(chart_layout)
+    slide.shapes.title.text = "Entity Co-occurrence Network"
+    if network_image:
+        slide.shapes.add_picture(network_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
+    else:
+        slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
+    # --- 10. Topic Modeling Bubble Chart Slide ---
     if df_topic_data is not None and not df_topic_data.empty:
         # Ensure data frame is in the format expected by create_topic_word_bubbles
         df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
             slide = prs.slides.add_slide(chart_layout)
             slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
             slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
+        else:
+            slide = prs.slides.add_slide(chart_layout)
+            slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
+            slide.placeholders[1].text = "Chart generation failed. Ensure the 'kaleido' library is installed for Plotly image export."
     else:
         slide = prs.slides.add_slide(chart_layout)
         slide.shapes.title.text = "Topic Modeling Results"
         slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
     pptx_buffer.seek(0)
     return pptx_buffer
+# --- NEW CSV GENERATION FUNCTION (Retained) ---
 def generate_entity_csv(df):
     """
     Generates a CSV file of the extracted entities in an in-memory buffer,
     return csv_buffer
 # -----------------------------------
+# --- Existing App Functionality (HTML) (Retained) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     """
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 expander = st.expander("**Important notes**")
 expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
+**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and crucially, **`kaleido`** (for converting Plotly charts into static images).
 **Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
 **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 # --- Model Loading ---
+@st.cache_resource
 def load_ner_model():
     """Loads the GLiNER model and caches it."""
     try:
             mime="text/csv",
             type="secondary"
         )