Spaces:

AIEcosystem
/

relationship-map

Sleeping

App Files Files Community

AIEcosystem committed on Oct 8, 2025

Commit

2ec8241

verified ·

1 Parent(s): 965b307

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +73 -196

src/streamlit_app.py CHANGED Viewed

@@ -42,30 +42,34 @@ os.environ['HF_HOME'] = '/tmp'
 # --- Color Map for Highlighting and Network Graph Nodes ---
 entity_color_map = {
     "person": "#10b981",
-    "username": "#3b82f6",
-    "hashtag": "#4ade80",
-    "mention" : "#f97316",
     "organization": "#f59e0b",
-    "community": "#8b5cf6",
-    "position": "#ec4899",
-    "location": "#06b6d4",
-    "event": "#f43f5e",
-    "product": "#a855f7",
-    "platform": "#eab308",
-    "date": "#6366f1",
-    "media_type": "#14b8a6",
-    "url": "#60a5fa",
-    "nationality_religion": "#fb7185"
 }
 # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
 labels = list(entity_color_map.keys())
 category_mapping = {
-    "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
-    "Location & Organization": ["location", "organization"],
-    "Temporal & Events": ["event", "date"],
-    "Digital & Products": ["platform", "product", "media_type", "url"],
 }
 reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
@@ -293,156 +297,11 @@ def generate_network_graph(df, raw_text):
     return fig
-# --- PPTX HELPER FUNCTIONS (Integrated from generate_report.py) ---
-def fig_to_image_buffer(fig):
-    """
-    Converts a Plotly figure object into a BytesIO buffer containing PNG data.
-    Requires 'kaleido' to be installed for image export.
-    Returns None if export fails.
-    """
-    try:
-        # Use pio.to_image to convert the figure to a PNG byte array
-        img_bytes = pio.to_image(fig, format="png", width=900, height=500, scale=2)
-        img_buffer = BytesIO(img_bytes)
-        return img_buffer
-    except Exception as e:
-        # In a Streamlit environment, we can't show this error directly in the app execution flow
-        print(f"Error converting Plotly figure to image: {e}")
-        return None
-# --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---
-def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
-    """
-    Generates a PowerPoint presentation (.pptx) file containing key analysis results.
-    Returns the file content as a BytesIO buffer.
-    """
-    prs = Presentation()
-    # Layout 5: Title and Content (often good for charts)
-    chart_layout = prs.slide_layouts[5]
-    # 1. Title Slide
-    title_slide_layout = prs.slide_layouts[0]
-    slide = prs.slides.add_slide(title_slide_layout)
-    title = slide.shapes.title
-    subtitle = slide.placeholders[1]
-    title.text = "NER & Topic Analysis Report"
-    subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
-    # 2. Source Text Slide
-    slide = prs.slides.add_slide(chart_layout)
-    slide.shapes.title.text = "Analyzed Source Text"
-    # Add the raw text to a text box
-    left = Inches(0.5)
-    top = Inches(1.5)
-    width = Inches(9.0)
-    height = Inches(5.0)
-    txBox = slide.shapes.add_textbox(left, top, width, height)
-    tf = txBox.text_frame
-    tf.margin_top = Inches(0.1)
-    tf.margin_bottom = Inches(0.1)
-    tf.word_wrap = True
-    p = tf.add_paragraph()
-    p.text = text_input
-    p.font.size = Pt(14)
-    p.font.name = 'Arial'
-    # 3. Entity Summary Slide (Table)
-    slide = prs.slides.add_slide(chart_layout)
-    slide.shapes.title.text = "Entity Summary (Count by Category and Label)"
-    # Create the summary table using the app's established logic
-    grouped_entity_table = df['label'].value_counts().reset_index()
-    grouped_entity_table.columns = ['Entity Label', 'Count']
-    grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
-        lambda x: reverse_category_mapping.get(x, 'Other')
-    )
-    grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]
-    # Simple way to insert a table:
-    rows, cols = grouped_entity_table.shape
-    x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
-    # Add 1 row for the header
-    table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table
-    # Set column widths
-    table.columns[0].width = Inches(2.7)
-    table.columns[1].width = Inches(2.8)
-    table.columns[2].width = Inches(2.5)
-    # Set column headers
-    for i, col in enumerate(grouped_entity_table.columns):
-        cell = table.cell(0, i)
-        cell.text = col
-        cell.fill.solid()
-        # Optional: Add simple styling to header
-    # Fill in the data
-    for i in range(rows):
-        for j in range(cols):
-            cell = table.cell(i+1, j)
-            cell.text = str(grouped_entity_table.iloc[i, j])
-            # Optional: Style data cells
-    # 4. Treemap Slide (Visualization)
-    fig_treemap = px.treemap(
-        df,
-        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
-        values='score',
-        color='category',
-        title="Entity Distribution by Category and Label",
-        color_discrete_sequence=px.colors.qualitative.Dark24
-    )
-    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
-    treemap_image = fig_to_image_buffer(fig_treemap)
-    if treemap_image:
-        slide = prs.slides.add_slide(chart_layout)
-        slide.shapes.title.text = "Entity Distribution Treemap"
-        slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
-    # 5. Entity Count Bar Chart Slide (Visualization)
-    grouped_counts = df['category'].value_counts().reset_index()
-    grouped_counts.columns = ['Category', 'Count']
-    fig_bar_category = px.bar(
-        grouped_counts,
-        x='Category',
-        y='Count',
-        color='Category',
-        title='Total Entities per Category',
-        color_discrete_sequence=px.colors.qualitative.Pastel
-    )
-    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
-    bar_category_image = fig_to_image_buffer(fig_bar_category)
-    if bar_category_image:
-        slide = prs.slides.add_slide(chart_layout)
-        slide.shapes.title.text = "Total Entities per Category"
-        slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
-    # 6. Topic Modeling Bubble Chart Slide
-    if df_topic_data is not None and not df_topic_data.empty:
-        # Ensure data frame is in the format expected by create_topic_word_bubbles
-        df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
-        bubble_figure = create_topic_word_bubbles(df_topic_data_pptx)
-        bubble_image = fig_to_image_buffer(bubble_figure)
-        if bubble_image:
-            slide = prs.slides.add_slide(chart_layout)
-            slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
-            slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
-    else:
-        # Placeholder slide if topic modeling is not available
-        slide = prs.slides.add_slide(chart_layout)
-        slide.shapes.title.text = "Topic Modeling Results"
-        slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
-    # Save the presentation to an in-memory buffer
-    pptx_buffer = BytesIO()
-    prs.save(pptx_buffer)
-    pptx_buffer.seek(0)
-    return pptx_buffer
 # --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
@@ -568,9 +427,9 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
         <div class="chart-box">{pie_html}</div>
         <div class="chart-box">{bar_category_html}</div>
         <div class="chart-box">{bar_freq_html}</div>
-        <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
         <div class="chart-box">{network_html}</div>
-        <h2>4. Topic Modeling (LDA on Entities)</h2>
         {topic_charts_html}
     </div></body></html>
     """
@@ -612,13 +471,24 @@ st.markdown(
     </style>
     """,
     unsafe_allow_html=True)
-st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 expander = st.expander("**Important notes**")
-expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
-**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
-**Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
-**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 # --- Comet ML Setup (Placeholder/Conditional) ---
@@ -753,20 +623,23 @@ if st.session_state.show_results:
         st.markdown("### 1. Analyzed Text with Highlighted Entities")
         st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
-        # 2. Entity Summary Table
-        st.markdown("### 2. Entity Summary Table (Count by Label)")
-        grouped_entity_table = df['label'].value_counts().reset_index()
-        grouped_entity_table.columns = ['Entity Label', 'Count']
-        grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
-        st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)
-        st.markdown("---")
-        # 3. Detailed Entity Analysis Tabs
-        st.markdown("### 3. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         with tab_category_details:
             st.markdown("#### Detailed Entities Table (Grouped by Category)")
             unique_categories = list(category_mapping.keys())
             tabs_category = st.tabs(unique_categories)
             for category, tab in zip(unique_categories, tabs_category):
@@ -795,9 +668,9 @@ if st.session_state.show_results:
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
-        # 4. Comparative Charts
         st.markdown("---")
-        st.markdown("### 4. Comparative Charts")
         col1, col2, col3 = st.columns(3)
@@ -826,12 +699,12 @@ if st.session_state.show_results:
                 st.info("No entities repeat for frequency chart.")
         st.markdown("---")
-        st.markdown("### 5. Entity Co-occurrence Network")
         network_fig = generate_network_graph(df, st.session_state.last_text)
         st.plotly_chart(network_fig, use_container_width=True)
         st.markdown("---")
-        st.markdown("### 6. Topic Modeling Analysis")
         if df_topic_data is not None and not df_topic_data.empty:
             bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -856,17 +729,9 @@ if st.session_state.show_results:
             type="primary"
         )
-        # 2. PowerPoint PPTX Download (Retained)
-        pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
-        st.download_button(
-            label="Download Presentation Slides (.pptx)",
-            data=pptx_buffer,
-            file_name="ner_topic_report.pptx",
-            mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
-            type="primary"
-        )
-        # 3. CSV Data Download (NEW)
         csv_buffer = generate_entity_csv(df)
         st.download_button(
             label="Download Extracted Entities (CSV)",
@@ -875,4 +740,16 @@ if st.session_state.show_results:
             mime="text/csv",
             type="secondary"
         )

 # --- Color Map for Highlighting and Network Graph Nodes ---
 entity_color_map = {
     "person": "#10b981",
+    "country": "#3b82f6",
+    "city": "#4ade80",
     "organization": "#f59e0b",
+    "date": "#8b5cf6",
+    "time": "#ec4899",
+    "cardinal": "#06b6d4",
+    "money": "#f43f5e",
+    "position": "#a855f7",
 }
 # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
 labels = list(entity_color_map.keys())
 category_mapping = {
+   "People": ["person", "organization", "position"],
+   "Locations": ["country", "city"],
+   "Time": ["date", "time"],
+   "Numbers": ["money", "cardinal"]
 }
 reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
     return fig
 # --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
         <div class="chart-box">{pie_html}</div>
         <div class="chart-box">{bar_category_html}</div>
         <div class="chart-box">{bar_freq_html}</div>
+        <h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
         <div class="chart-box">{network_html}</div>
+        <h2>4. Topic Modelling</h2>
         {topic_charts_html}
     </div></body></html>
     """
     </style>
     """,
     unsafe_allow_html=True)
+st.subheader("Entity and Topic Analysis Report Generator", divider="rainbow")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 expander = st.expander("**Important notes**")
+expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
+**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
+**How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
+**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 # --- Comet ML Setup (Placeholder/Conditional) ---
         st.markdown("### 1. Analyzed Text with Highlighted Entities")
         st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
+        # 2. Detailed Entity Analysis Tabs
+        st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         with tab_category_details:
             st.markdown("#### Detailed Entities Table (Grouped by Category)")
+            with st.expander("See Glossary of tags"):
+                st.write('''
+                - **text**: ['entity extracted from your text data']
+                - **label**: ['label (tag) assigned to a given extracted entity']
+                - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
+                - **start**: ['index of the start of the corresponding entity']
+                - **end**: ['index of the end of the corresponding entity']
+                ''')
             unique_categories = list(category_mapping.keys())
             tabs_category = st.tabs(unique_categories)
             for category, tab in zip(unique_categories, tabs_category):
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
+        # 3. Comparative Charts
         st.markdown("---")
+        st.markdown("### 3. Comparative Charts")
         col1, col2, col3 = st.columns(3)
                 st.info("No entities repeat for frequency chart.")
         st.markdown("---")
+        st.markdown("### 4. Entity Relationship Map")
         network_fig = generate_network_graph(df, st.session_state.last_text)
         st.plotly_chart(network_fig, use_container_width=True)
         st.markdown("---")
+        st.markdown("### 5. Topic Modelling Analysis")
         if df_topic_data is not None and not df_topic_data.empty:
             bubble_figure = create_topic_word_bubbles(df_topic_data)
             type="primary"
         )
+        # 2. CSV Data Download (NEW)
         csv_buffer = generate_entity_csv(df)
         st.download_button(
             label="Download Extracted Entities (CSV)",
             mime="text/csv",
             type="secondary"
         )
+        with st.expander("Embed this app on your website"):
+           st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
+           code = '''
+           <iframe
+           src="https://aiecosystem-dataharvest.hf.space"
+           frameborder="0"
+           width="850"
+           height="450"
+           ></iframe>
+           '''
+           st.code(code, language="html")