Spaces:

AIEcosystem
/

relationship-map

Sleeping

App Files Files Community

AIEcosystem committed on Oct 8, 2025

Commit

932f856

verified ·

1 Parent(s): b90f5cd

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +31 -133

src/streamlit_app.py CHANGED Viewed

@@ -24,7 +24,6 @@ from sklearn.decomposition import LatentDirichletAllocation
 # ------------------------------
 from gliner import GLiNER
 from streamlit_extras.stylable_container import stylable_container
 # Using a try/except for comet_ml import
 try:
     from comet_ml import Experiment
@@ -34,79 +33,56 @@ except ImportError:
         def log_parameter(self, *args): pass
         def log_table(self, *args): pass
         def end(self): pass
 # --- Model Home Directory (Fix for deployment environments) ---
 # Set HF_HOME environment variable to a writable path
 os.environ['HF_HOME'] = '/tmp'
 # --- Color Map for Highlighting and Network Graph Nodes ---
 entity_color_map = {
     "person": "#10b981",
     "country": "#3b82f6",
     "city": "#4ade80",
     "organization": "#f59e0b",
     "date": "#8b5cf6",
     "time": "#ec4899",
     "cardinal": "#06b6d4",
     "money": "#f43f5e",
     "position": "#a855f7",
-}
 # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
 labels = list(entity_color_map.keys())
 category_mapping = {
    "People": ["person", "organization", "position"],
    "Locations": ["country", "city"],
    "Time": ["date", "time"],
-   "Numbers": ["money", "cardinal"]
-}
 reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 # --- Utility Functions for Analysis and Plotly ---
 def extract_label(node_name):
     """Return the label enclosed in the trailing parentheses of a node name.

     Node names are expected to look like 'Text (Label)'. When the string does
     not end with a non-empty parenthesised group, "Unknown" is returned.
     """
     found = re.search(r'\(([^)]+)\)$', node_name)
     if found is None:
         return "Unknown"
     return found.group(1)
 def remove_trailing_punctuation(text_string):
     """Return text_string with any run of trailing ASCII punctuation dropped.

     Only characters found in string.punctuation are trimmed, and only from
     the right-hand end; leading and interior punctuation is kept.
     """
     cut = len(text_string)
     # Walk back from the end until a non-punctuation character is reached.
     while cut > 0 and text_string[cut - 1] in string.punctuation:
         cut -= 1
     return text_string[:cut]
 def highlight_entities(text, df_entities):
     """Generates HTML to display text with entities highlighted and colored."""
     if df_entities.empty:
         return text
     # Sort entities by start index descending to insert highlights without affecting subsequent indices
     entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
     highlighted_text = text
     for entity in entities:
         start = entity['start']
         end = entity['end']
         label = entity['label']
         entity_text = entity['text']
         color = entity_color_map.get(label, '#000000')
         # Create a span with background color and tooltip
         highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
         # Replace the original text segment with the highlighted HTML
         highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
     # Use a div to mimic the Streamlit input box style for the report
-    return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """
     Performs basic Topic Modeling using LDA on the extracted entities
@@ -115,7 +91,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     documents = df_entities['text'].unique().tolist()
     if len(documents) < 2:
         return None
     N = min(num_top_words, len(documents))
     try:
         tfidf_vectorizer = TfidfVectorizer(
@@ -125,7 +100,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
         )
         tfidf = tfidf_vectorizer.fit_transform(documents)
         tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
         lda = LatentDirichletAllocation(
             n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
         )
@@ -145,13 +119,11 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     except Exception as e:
         st.error(f"Topic modeling failed: {e}")
         return None
 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across all topics."""
     # Renaming columns to match the output of perform_topic_modeling
     df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
     df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
     if df_topic_data.empty:
         return None
     fig = px.scatter(
@@ -177,14 +149,13 @@ def create_topic_word_bubbles(df_topic_data):
         xaxis={'tickangle': -45, 'showgrid': False},
         yaxis={'showgrid': True},
         showlegend=True,
-        plot_bgcolor='#FFF0F5',
-        paper_bgcolor='#FFF0F5',
         height=600,
         margin=dict(t=50, b=100, l=50, r=10),
     )
     fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
     return fig
 def generate_network_graph(df, raw_text):
     """
     Generates a network graph visualization (Node Plot) with edges
@@ -193,21 +164,16 @@ def generate_network_graph(df, raw_text):
     # Using the existing generate_network_graph logic from previous context...
     entity_counts = df['text'].value_counts().reset_index()
     entity_counts.columns = ['text', 'frequency']
     unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
     if unique_entities.shape[0] < 2:
         return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
     num_nodes = len(unique_entities)
     thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
     radius = 10
     unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
     unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
     pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
     edges = set()
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
     for sentence in sentences:
         entities_in_sentence = []
@@ -215,25 +181,20 @@ def generate_network_graph(df, raw_text):
             if entity_text.lower() in sentence.lower():
                 entities_in_sentence.append(entity_text)
         unique_entities_in_sentence = list(set(entities_in_sentence))
         for i in range(len(unique_entities_in_sentence)):
             for j in range(i + 1, len(unique_entities_in_sentence)):
                 node1 = unique_entities_in_sentence[i]
                 node2 = unique_entities_in_sentence[j]
                 edge_tuple = tuple(sorted((node1, node2)))
                 edges.add(edge_tuple)
     edge_x = []
     edge_y = []
     for edge in edges:
         n1, n2 = edge
         if n1 in pos_map and n2 in pos_map:
             edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
             edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
     fig = go.Figure()
     edge_trace = go.Scatter(
         x=edge_x, y=edge_y,
         line=dict(width=0.5, color='#888'),
@@ -243,7 +204,6 @@ def generate_network_graph(df, raw_text):
         showlegend=False
     )
     fig.add_trace(edge_trace)
     fig.add_trace(go.Scatter(
         x=unique_entities['x'],
         y=unique_entities['y'],
@@ -268,7 +228,6 @@ def generate_network_graph(df, raw_text):
             "Frequency: %{customdata[2]}<extra></extra>"
         )
     ))
     legend_traces = []
     seen_labels = set()
     for index, row in unique_entities.iterrows():
@@ -281,7 +240,6 @@ def generate_network_graph(df, raw_text):
             ))
     for trace in legend_traces:
         fig.add_trace(trace)
     fig.update_layout(
         title='Entity Co-occurrence Network (Edges = Same Sentence)',
         showlegend=True,
@@ -293,16 +251,7 @@ def generate_network_graph(df, raw_text):
         margin=dict(t=50, b=10, l=10, r=10),
         height=600
     )
     return fig
 # --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
     """
@@ -316,16 +265,13 @@ def generate_entity_csv(df):
     csv_buffer.seek(0)
     return csv_buffer
 # -----------------------------------
 # --- Existing App Functionality (HTML) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     """
     Generates a full HTML report containing all analysis results and visualizations.
     (Content omitted for brevity but assumed to be here).
     """
     # 1. Generate Visualizations (Plotly HTML)
     # 1a. Treemap
     fig_treemap = px.treemap(
         df,
@@ -337,34 +283,30 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     )
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
     # 1b. Pie Chart
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
-    fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
     fig_pie.update_layout(margin=dict(t=50, b=10))
     pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
     # 1c. Bar Chart (Category Count)
     fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
     bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
     # 1d. Bar Chart (Most Frequent Entities)
     word_counts = df['text'].value_counts().reset_index()
     word_counts.columns = ['Entity', 'Count']
     repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
     bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
     if not repeating_entities.empty:
-        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
         fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
         bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
     # 1e. Network Graph HTML
     network_fig = generate_network_graph(df, text_input)
     network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
     # 1f. Topic Charts HTML
     topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
     if df_topic_data is not None and not df_topic_data.empty:
@@ -374,20 +316,17 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
         else:
             topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
     else:
-        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
         topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
         topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
         topic_charts_html += '</div>'
     # 2. Get Highlighted Text
     highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
     # 3. Entity Tables (Pandas to HTML)
     entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
         classes='table table-striped',
         index=False
     )
     # 4. Construct the Final HTML
     html_content = f"""<!DOCTYPE html><html lang="en"><head>
     <meta charset="UTF-8">
@@ -397,21 +336,20 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     <style>
         body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
         .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
-        h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
         h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
         h3 {{ color: #555; margin-top: 20px; }}
-        .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
         .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
         table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
         table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
         table th {{ background-color: #f0f0f0; }}
-        .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
     </style></head><body>
     <div class="container">
         <h1>Entity and Topic Analysis Report</h1>
         <div class="metadata">
-            <p><strong>Generated at:</strong> {time.strftime('%Y-%m-%d')}</p>
             <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
         </div>
         <h2>1. Analyzed Text & Extracted Entities</h2>
@@ -435,8 +373,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     </div></body></html>
     """
     return html_content
 # --- Page Configuration and Styling (No Sidebar) ---
 st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 st.markdown(
@@ -444,21 +380,21 @@ st.markdown(
     <style>
     /* Overall app container - NO SIDEBAR */
     .main {
-        background-color: #FFF0F5; /* Blanched Almond/Light Pink */
         color: #333333; /* Dark grey text for contrast */
     }
     .stApp {
-        background-color: #FFF0F5;
     }
     /* Text Area background and text color (input fields) */
     .stTextArea textarea {
-        background-color: #FFFAF0; /* Floral White/Near white for input fields */
         color: #000000; /* Black text for input */
-        border: 1px solid #FF69B4; /* Deep Pink border */
     }
     /* Button styling */
     .stButton > button {
-        background-color: #FF69B4; /* Deep Pink for the button */
         color: #FFFFFF; /* White text for contrast */
         border: none;
         padding: 10px 20px;
@@ -466,24 +402,25 @@ st.markdown(
     }
     /* Expander header and content background */
     .streamlit-expanderHeader, .streamlit-expanderContent {
-        background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
         color: #333333;
     }
     </style>
     """,
     unsafe_allow_html=True)
-st.subheader("Entity and Topic Analysis Report Generator", divider="rainbow")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 tab1, tab2 = st.tabs(["Important Notes", "Embed"])
 with tab1:
     expander = st.expander("**Important notes**")
     expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
     **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
     **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
     **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
 with tab2:
     with st.expander("Embed"):
         st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
@@ -496,15 +433,13 @@ with tab2:
     ></iframe>
     '''
         st.code(code, language="html")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 # --- Comet ML Setup (Placeholder/Conditional) ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 # --- Model Loading ---
 @st.cache_resource
 def load_ner_model():
@@ -514,9 +449,7 @@ def load_ner_model():
     except Exception as e:
         st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
         st.stop()
 model = load_ner_model()
 # --- LONG DEFAULT TEXT (178 Words) ---
 DEFAULT_TEXT = (
     "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
@@ -531,8 +464,7 @@ DEFAULT_TEXT = (
     "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
     "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
     "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
-    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
-)
 # -----------------------------------
 # --- Session State Initialization (CRITICAL FIX) ---
 if 'show_results' not in st.session_state:
@@ -547,7 +479,6 @@ if 'topic_results' not in st.session_state:
     st.session_state.topic_results = None
 if 'my_text_area' not in st.session_state:
     st.session_state.my_text_area = DEFAULT_TEXT
 # --- Clear Button Function (MODIFIED) ---
 def clear_text():
     """Clears the text area (sets it to an empty string) and hides results."""
@@ -557,7 +488,6 @@ def clear_text():
     st.session_state.results_df = pd.DataFrame()
     st.session_state.elapsed_time = 0.0
     st.session_state.topic_results = None
 # --- Text Input and Clear Button ---
 word_limit = 1000
 text = st.text_area(
@@ -565,11 +495,9 @@ text = st.text_area(
     height=250,
     key='my_text_area',
     value=st.session_state.my_text_area)
 word_count = len(text.split())
 st.markdown(f"**Word count:** {word_count}/{word_limit}")
 st.button("Clear text", on_click=clear_text)
 # --- Results Trigger and Processing (Updated Logic) ---
 if st.button("Results"):
     if not text.strip():
@@ -583,25 +511,20 @@ if st.button("Results"):
             if text != st.session_state.last_text:
                 st.session_state.last_text = text
                 start_time = time.time()
                 # --- Model Prediction & Dataframe Creation ---
                 entities = model.predict_entities(text, labels)
                 df = pd.DataFrame(entities)
                 if not df.empty:
                     df['text'] = df['text'].apply(remove_trailing_punctuation)
                     df['category'] = df['label'].map(reverse_category_mapping)
                     st.session_state.results_df = df
                     unique_entity_count = len(df['text'].unique())
                     N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
                     st.session_state.topic_results = perform_topic_modeling(
                         df,
                         num_topics=2,
                         num_top_words=N_TOP_WORDS_TO_USE
                     )
                     if comet_initialized:
                         experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                         experiment.log_parameter("input_text", text)
@@ -610,33 +533,25 @@ if st.button("Results"):
                 else:
                     st.session_state.results_df = pd.DataFrame()
                     st.session_state.topic_results = None
                 end_time = time.time()
                 st.session_state.elapsed_time = end_time - start_time
                 st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
             st.session_state.show_results = True
 # --- Display Download Link and Results ---
 if st.session_state.show_results:
     df = st.session_state.results_df
     df_topic_data = st.session_state.topic_results
     if df.empty:
         st.warning("No entities were found in the provided text.")
     else:
         st.subheader("Analysis Results", divider="blue")
         # 1. Highlighted Text
         st.markdown("### 1. Analyzed Text with Highlighted Entities")
         st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
         # 2. Detailed Entity Analysis Tabs
         st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         with tab_category_details:
             st.markdown("#### Detailed Entities Table (Grouped by Category)")
             with st.expander("See Glossary of tags"):
@@ -647,7 +562,6 @@ if st.session_state.show_results:
                 - **start**: ['index of the start of the corresponding entity']
                 - **end**: ['index of the end of the corresponding entity']
                 ''')
             unique_categories = list(category_mapping.keys())
             tabs_category = st.tabs(unique_categories)
             for category, tab in zip(unique_categories, tabs_category):
@@ -662,7 +576,6 @@ if st.session_state.show_results:
                         )
                     else:
                         st.info(f"No entities of category **{category}** were found in the text.")
         with tab_treemap_viz:
             st.markdown("#### Treemap: Entity Distribution")
             fig_treemap = px.treemap(
@@ -670,50 +583,42 @@ if st.session_state.show_results:
                 path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                 values='score',
                 color='category',
                 color_discrete_sequence=px.colors.qualitative.Dark24
             )
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
         # 3. Comparative Charts
         st.markdown("---")
         st.markdown("### 3. Comparative Charts")
         col1, col2, col3 = st.columns(3)
         grouped_counts = df['category'].value_counts().reset_index()
         grouped_counts.columns = ['Category', 'Count']
         with col1: # Pie Chart
-            fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
             fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_pie, use_container_width=True)
         with col2: # Bar Chart (Category Count)
             fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
             fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_bar_category, use_container_width=True)
         with col3: # Bar Chart (Most Frequent Entities)
             word_counts = df['text'].value_counts().reset_index()
             word_counts.columns = ['Entity', 'Count']
             repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
             if not repeating_entities.empty:
-                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
                 fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
                 st.plotly_chart(fig_bar_freq, use_container_width=True)
             else:
                 st.info("No entities repeat for frequency chart.")
         st.markdown("---")
         st.markdown("### 4. Entity Relationship Map")
         network_fig = generate_network_graph(df, st.session_state.last_text)
         st.plotly_chart(network_fig, use_container_width=True)
         st.markdown("---")
         st.markdown("### 5. Topic Modelling Analysis")
         if df_topic_data is not None and not df_topic_data.empty:
             bubble_figure = create_topic_word_bubbles(df_topic_data)
             if bubble_figure:
@@ -722,11 +627,9 @@ if st.session_state.show_results:
                 st.error("Error generating Topic Word Bubble Chart.")
         else:
             st.info("Topic modeling requires more unique input (at least two unique entities).")
         # --- Report Download ---
         st.markdown("---")
         st.markdown("### Download Full Report Artifacts")
         # 1. HTML Report Download (Retained)
         html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
         st.download_button(
@@ -737,9 +640,7 @@ if st.session_state.show_results:
             type="primary"
         )
-        # 2. CSV Data Download (NEW)
         csv_buffer = generate_entity_csv(df)
         st.download_button(
             label="Download Extracted Entities (CSV)",
@@ -748,6 +649,3 @@ if st.session_state.show_results:
             mime="text/csv",
             type="secondary"
         )

 # ------------------------------
 from gliner import GLiNER
 from streamlit_extras.stylable_container import stylable_container
 # Using a try/except for comet_ml import
 try:
     from comet_ml import Experiment
         def log_parameter(self, *args): pass
         def log_table(self, *args): pass
         def end(self): pass
 # --- Model Home Directory (Fix for deployment environments) ---
 # Set HF_HOME environment variable to a writable path
 os.environ['HF_HOME'] = '/tmp'
 # --- Color Map for Highlighting and Network Graph Nodes ---
 # Hex colour for each entity label; the key order also fixes the order of `labels`.
 entity_color_map = {
     "person": "#10b981",
     "country": "#3b82f6",
     "city": "#4ade80",
     "organization": "#f59e0b",
     "date": "#8b5cf6",
     "time": "#ec4899",
     "cardinal": "#06b6d4",
     "money": "#f43f5e",
     "position": "#a855f7",
 }
 # --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
 # Entity labels handed to the model: the colour-map keys, in insertion order.
 labels = [*entity_color_map]
 # Display categories, each listing the entity labels that belong to it.
 category_mapping = {
     "People": ["person", "organization", "position"],
     "Locations": ["country", "city"],
     "Time": ["date", "time"],
     "Numbers": ["money", "cardinal"],
 }
 # Inverse lookup (label -> category name), used to fill the 'category' column.
 reverse_category_mapping = {
     member: group
     for group, members in category_mapping.items()
     for member in members
 }
 # --- Utility Functions for Analysis and Plotly ---
 def extract_label(node_name):
     """Return the label held in the trailing parentheses of a node name.

     Node names are expected to look like 'Text (Label)'. When the name does
     not end with a non-empty parenthesised group, "Unknown" is returned.
     """
     found = re.search(r'\(([^)]+)\)$', node_name)
     if found is None:
         return "Unknown"
     return found.group(1)
 def remove_trailing_punctuation(text_string):
     """Return ``text_string`` with any run of trailing ASCII punctuation removed.

     Only characters listed in ``string.punctuation`` are stripped, and only
     from the right-hand end; leading and interior punctuation is kept.
     """
     strippable = string.punctuation
     trimmed = text_string.rstrip(strippable)
     return trimmed
 def highlight_entities(text, df_entities):
     """Render ``text`` as HTML with every extracted entity highlighted.

     Args:
         text: The analysed text; ``start``/``end`` values index into it.
         df_entities: DataFrame with ``start``, ``end``, ``label`` and ``text``
             columns, one row per entity.

     Returns:
         The HTML-escaped ``text`` when ``df_entities`` is empty. Otherwise a
         styled ``<div>`` in which each entity span is replaced by a coloured
         ``<span>`` whose tooltip is the entity label.

     The result is rendered with ``unsafe_allow_html=True`` and reused when
     building the HTML report, so all user-derived text (the surrounding text,
     the entity text and the label) is HTML-escaped before being embedded.
     """
     from html import escape  # function-scope import: no file-level change needed

     if df_entities.empty:
         return escape(text)
     # Walk the entities from the end of the text towards the start, collecting
     # escaped plain-text gaps and highlight spans; the pieces are reversed once
     # at the end. Working on the untouched ``text`` keeps every index valid.
     entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
     pieces = []
     cursor = len(text)  # everything from ``cursor`` onwards is already emitted
     for entity in entities:
         start = entity['start']
         end = entity['end']
         if end > cursor:
             # Overlaps an entity that was already emitted; splicing it in would
             # cut into the previous span's markup, so it is left unhighlighted.
             continue
         label = entity['label']
         color = entity_color_map.get(label, '#000000')
         # NOTE(review): the row's ``text`` value is displayed in place of
         # text[start:end], as before; if callers normalise that column the
         # displayed span can differ from the source text - confirm intent.
         entity_text = escape(str(entity['text']))
         safe_label = escape(str(label))
         # Create a span with background color and tooltip
         highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{safe_label}">{entity_text}</span>'
         pieces.append(escape(text[end:cursor]))
         pieces.append(highlight_html)
         cursor = start
     pieces.append(escape(text[:cursor]))
     pieces.reverse()
     highlighted_text = ''.join(pieces)
     # Use a div to mimic the Streamlit input box style for the report
     return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """
     Performs basic Topic Modeling using LDA on the extracted entities
     documents = df_entities['text'].unique().tolist()
     if len(documents) < 2:
         return None
     N = min(num_top_words, len(documents))
     try:
         tfidf_vectorizer = TfidfVectorizer(
         )
         tfidf = tfidf_vectorizer.fit_transform(documents)
         tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
         lda = LatentDirichletAllocation(
             n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
         )
     except Exception as e:
         st.error(f"Topic modeling failed: {e}")
         return None
 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across all topics."""
     # Renaming columns to match the output of perform_topic_modeling
     df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
     df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
     if df_topic_data.empty:
         return None
     fig = px.scatter(
         xaxis={'tickangle': -45, 'showgrid': False},
         yaxis={'showgrid': True},
         showlegend=True,
+        plot_bgcolor='#f9f9f9', # Changed from pink
+        paper_bgcolor='#f9f9f9', # Changed from pink
         height=600,
         margin=dict(t=50, b=100, l=50, r=10),
     )
     fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
     return fig
 def generate_network_graph(df, raw_text):
     """
     Generates a network graph visualization (Node Plot) with edges
     # Count how often each entity text occurs and attach that frequency to every unique (text, label) row.
     entity_counts = df['text'].value_counts().reset_index()
     entity_counts.columns = ['text', 'frequency']
     unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
     if unique_entities.shape[0] < 2:
         return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
     num_nodes = len(unique_entities)
     thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
     radius = 10
     unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
     unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
     pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
     edges = set()
     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
     for sentence in sentences:
         entities_in_sentence = []
             if entity_text.lower() in sentence.lower():
                 entities_in_sentence.append(entity_text)
         unique_entities_in_sentence = list(set(entities_in_sentence))
         for i in range(len(unique_entities_in_sentence)):
             for j in range(i + 1, len(unique_entities_in_sentence)):
                 node1 = unique_entities_in_sentence[i]
                 node2 = unique_entities_in_sentence[j]
                 edge_tuple = tuple(sorted((node1, node2)))
                 edges.add(edge_tuple)
     edge_x = []
     edge_y = []
     for edge in edges:
         n1, n2 = edge
         if n1 in pos_map and n2 in pos_map:
             edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
             edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
     fig = go.Figure()
     edge_trace = go.Scatter(
         x=edge_x, y=edge_y,
         line=dict(width=0.5, color='#888'),
         showlegend=False
     )
     fig.add_trace(edge_trace)
     fig.add_trace(go.Scatter(
         x=unique_entities['x'],
         y=unique_entities['y'],
             "Frequency: %{customdata[2]}<extra></extra>"
         )
     ))
     legend_traces = []
     seen_labels = set()
     for index, row in unique_entities.iterrows():
             ))
     for trace in legend_traces:
         fig.add_trace(trace)
     fig.update_layout(
         title='Entity Co-occurrence Network (Edges = Same Sentence)',
         showlegend=True,
         margin=dict(t=50, b=10, l=10, r=10),
         height=600
     )
     return fig
 # --- NEW CSV GENERATION FUNCTION ---
 def generate_entity_csv(df):
     """
     csv_buffer.seek(0)
     return csv_buffer
 # -----------------------------------
 # --- Existing App Functionality (HTML) ---
 def generate_html_report(df, text_input, elapsed_time, df_topic_data):
     """
     Generates a full HTML report containing all analysis results and visualizations.
     Returns the complete report as a single HTML string.
     """
     # 1. Generate Visualizations (Plotly HTML)
     # 1a. Treemap
     fig_treemap = px.treemap(
         df,
     )
     fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
     # 1b. Pie Chart
     grouped_counts = df['category'].value_counts().reset_index()
     grouped_counts.columns = ['Category', 'Count']
+    # Changed color_discrete_sequence from sequential.RdBu (which has reds) to sequential.Cividis
+    fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
     fig_pie.update_layout(margin=dict(t=50, b=10))
     pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
     # 1c. Bar Chart (Category Count)
     fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
     fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
     bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
     # 1d. Bar Chart (Most Frequent Entities)
     word_counts = df['text'].value_counts().reset_index()
     word_counts.columns = ['Entity', 'Count']
     repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
     bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
     if not repeating_entities.empty:
+        # Changed color_discrete_sequence from sequential.Plasma (which has pink/magenta) to sequential.Viridis
+        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
         fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
         bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
     # 1e. Network Graph HTML
     network_fig = generate_network_graph(df, text_input)
     network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
     # 1f. Topic Charts HTML
     topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
     if df_topic_data is not None and not df_topic_data.empty:
         else:
             topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
     else:
+        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
         topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
         topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
         topic_charts_html += '</div>'
     # 2. Get Highlighted Text
     highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
     # 3. Entity Tables (Pandas to HTML)
     entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
         classes='table table-striped',
         index=False
     )
     # 4. Construct the Final HTML
     html_content = f"""<!DOCTYPE html><html lang="en"><head>
     <meta charset="UTF-8">
     <style>
         body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
         .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
+        h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
         h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
         h3 {{ color: #555; margin-top: 20px; }}
+        .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
         .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
         table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
         table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
         table th {{ background-color: #f0f0f0; }}
+        .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
     </style></head><body>
     <div class="container">
         <h1>Entity and Topic Analysis Report</h1>
         <div class="metadata">
+            <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
             <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
         </div>
         <h2>1. Analyzed Text & Extracted Entities</h2>
     </div></body></html>
     """
     return html_content
 # --- Page Configuration and Styling (No Sidebar) ---
 st.set_page_config(layout="wide", page_title="NER & Topic Report App")
 st.markdown(
     <style>
     /* Overall app container - NO SIDEBAR */
     .main {
+        background-color: #f4f4f9; /* Changed from light pink */
         color: #333333; /* Dark grey text for contrast */
     }
     .stApp {
+        background-color: #f4f4f9; /* Changed from light pink */
     }
     /* Text Area background and text color (input fields) */
     .stTextArea textarea {
+        background-color: #ffffff; /* Changed from near white/pinkish */
         color: #000000; /* Black text for input */
+        border: 1px solid #888888; /* Changed border from pink to grey */
     }
     /* Button styling */
     .stButton > button {
+        background-color: #007bff; /* Changed from Deep Pink to Blue */
         color: #FFFFFF; /* White text for contrast */
         border: none;
         padding: 10px 20px;
     }
     /* Expander header and content background */
     .streamlit-expanderHeader, .streamlit-expanderContent {
+        background-color: #e9ecef; /* Changed from lighter pink to light grey/blue */
         color: #333333;
     }
     </style>
     """,
     unsafe_allow_html=True)
+st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
 tab1, tab2 = st.tabs(["Important Notes", "Embed"])
 with tab1:
     expander = st.expander("**Important notes**")
     expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
     **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
     **How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
     **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
 with tab2:
     with st.expander("Embed"):
         st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
     ></iframe>
     '''
         st.code(code, language="html")
 st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
 # --- Comet ML Setup (Placeholder/Conditional) ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
 comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
 # --- Model Loading ---
 @st.cache_resource
 def load_ner_model():
     except Exception as e:
         st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
         st.stop()
 model = load_ner_model()
 # --- LONG DEFAULT TEXT (178 Words) ---
 DEFAULT_TEXT = (
     "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
     "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
     "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
     "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
+    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
 # -----------------------------------
 # --- Session State Initialization (CRITICAL FIX) ---
 if 'show_results' not in st.session_state:
     st.session_state.topic_results = None
 if 'my_text_area' not in st.session_state:
     st.session_state.my_text_area = DEFAULT_TEXT
 # --- Clear Button Function (MODIFIED) ---
 def clear_text():
     """Clears the text area (sets it to an empty string) and hides results."""
     st.session_state.results_df = pd.DataFrame()
     st.session_state.elapsed_time = 0.0
     st.session_state.topic_results = None
 # --- Text Input and Clear Button ---
 word_limit = 1000
 text = st.text_area(
     height=250,
     key='my_text_area',
     value=st.session_state.my_text_area)
 word_count = len(text.split())
 st.markdown(f"**Word count:** {word_count}/{word_limit}")
 st.button("Clear text", on_click=clear_text)
 # --- Results Trigger and Processing (Updated Logic) ---
 if st.button("Results"):
     if not text.strip():
             if text != st.session_state.last_text:
                 st.session_state.last_text = text
                 start_time = time.time()
                 # --- Model Prediction & Dataframe Creation ---
                 entities = model.predict_entities(text, labels)
                 df = pd.DataFrame(entities)
                 if not df.empty:
                     df['text'] = df['text'].apply(remove_trailing_punctuation)
                     df['category'] = df['label'].map(reverse_category_mapping)
                     st.session_state.results_df = df
                     unique_entity_count = len(df['text'].unique())
                     N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
                     st.session_state.topic_results = perform_topic_modeling(
                         df,
                         num_topics=2,
                         num_top_words=N_TOP_WORDS_TO_USE
                     )
                     if comet_initialized:
                         experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                         experiment.log_parameter("input_text", text)
                 else:
                     st.session_state.results_df = pd.DataFrame()
                     st.session_state.topic_results = None
                 end_time = time.time()
                 st.session_state.elapsed_time = end_time - start_time
                 st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
             st.session_state.show_results = True
 # --- Display Download Link and Results ---
 if st.session_state.show_results:
     df = st.session_state.results_df
     df_topic_data = st.session_state.topic_results
     if df.empty:
         st.warning("No entities were found in the provided text.")
     else:
         st.subheader("Analysis Results", divider="blue")
         # 1. Highlighted Text
         st.markdown("### 1. Analyzed Text with Highlighted Entities")
         st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
         # 2. Detailed Entity Analysis Tabs
         st.markdown("### 2. Detailed Entity Analysis")
         tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
         with tab_category_details:
             st.markdown("#### Detailed Entities Table (Grouped by Category)")
             with st.expander("See Glossary of tags"):
                 - **start**: ['index of the start of the corresponding entity']
                 - **end**: ['index of the end of the corresponding entity']
                 ''')
             unique_categories = list(category_mapping.keys())
             tabs_category = st.tabs(unique_categories)
             for category, tab in zip(unique_categories, tabs_category):
                         )
                     else:
                         st.info(f"No entities of category **{category}** were found in the text.")
         with tab_treemap_viz:
             st.markdown("#### Treemap: Entity Distribution")
             fig_treemap = px.treemap(
                 path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                 values='score',
                 color='category',
                 color_discrete_sequence=px.colors.qualitative.Dark24
             )
             fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
             st.plotly_chart(fig_treemap, use_container_width=True)
         # 3. Comparative Charts
         st.markdown("---")
         st.markdown("### 3. Comparative Charts")
         col1, col2, col3 = st.columns(3)
         grouped_counts = df['category'].value_counts().reset_index()
         grouped_counts.columns = ['Category', 'Count']
         with col1: # Pie Chart
+            # Changed color_discrete_sequence
+            fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
             fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_pie, use_container_width=True)
         with col2: # Bar Chart (Category Count)
             fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
             fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
             st.plotly_chart(fig_bar_category, use_container_width=True)
         with col3: # Bar Chart (Most Frequent Entities)
             word_counts = df['text'].value_counts().reset_index()
             word_counts.columns = ['Entity', 'Count']
             repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
             if not repeating_entities.empty:
+                # Changed color_discrete_sequence
+                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
                 fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
                 st.plotly_chart(fig_bar_freq, use_container_width=True)
             else:
                 st.info("No entities repeat for frequency chart.")
         st.markdown("---")
         st.markdown("### 4. Entity Relationship Map")
         network_fig = generate_network_graph(df, st.session_state.last_text)
         st.plotly_chart(network_fig, use_container_width=True)
         st.markdown("---")
         st.markdown("### 5. Topic Modelling Analysis")
         if df_topic_data is not None and not df_topic_data.empty:
             bubble_figure = create_topic_word_bubbles(df_topic_data)
             if bubble_figure:
                 st.error("Error generating Topic Word Bubble Chart.")
         else:
             st.info("Topic modeling requires more unique input (at least two unique entities).")
         # --- Report Download ---
         st.markdown("---")
         st.markdown("### Download Full Report Artifacts")
         # 1. HTML Report Download (Retained)
         html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
         st.download_button(
             type="primary"
         )
+        # 2. CSV Data Download (NEW)
         csv_buffer = generate_entity_csv(df)
         st.download_button(
             label="Download Extracted Entities (CSV)",
             mime="text/csv",
             type="secondary"
         )