AIEcosystem committed on
Commit
1c3f8f0
·
verified ·
1 Parent(s): b04eb29

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +506 -492
src/streamlit_app.py CHANGED
@@ -18,464 +18,490 @@ from gliner import GLiNER
18
  from streamlit_extras.stylable_container import stylable_container
19
# Using a try/except for comet_ml import
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in so the app still runs when comet_ml is not installed."""

        def __init__(self, **kwargs):
            pass

        def log_parameter(self, *args):
            pass

        def log_table(self, *args):
            pass

        def end(self):
            pass

# --- Model Home Directory (Fix for deployment environments) ---
# Set HF_HOME environment variable to a writable path
os.environ['HF_HOME'] = '/tmp'
31
# --- Color Map for Highlighting and Network Graph Nodes ---
# One hex color per NER label; shared by the text highlighter and graph nodes.
entity_color_map = dict(
    person="#10b981",
    username="#3b82f6",
    hashtag="#4ade80",
    mention="#f97316",
    organization="#f59e0b",
    community="#8b5cf6",
    position="#ec4899",
    location="#06b6d4",
    event="#f43f5e",
    product="#a855f7",
    platform="#eab308",
    date="#6366f1",
    media_type="#14b8a6",
    url="#60a5fa",
    nationality_religion="#fb7185",
)
49
# --- Utility Functions ---
def extract_label(node_name):
    """Return the trailing parenthesized label of 'Text (Label)'.

    Falls back to "Unknown" when the string does not end with '(...)'.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found is None:
        return "Unknown"
    return found.group(1)
54
 
55
def remove_trailing_punctuation(text_string):
    """Strip any trailing ASCII punctuation characters from *text_string*."""
    cleaned = text_string.rstrip(string.punctuation)
    return cleaned
58
 
59
def highlight_entities(text, df_entities):
    """Generates HTML to display text with entities highlighted and colored.

    Args:
        text: The raw input string the entities were extracted from.
        df_entities: DataFrame with 'start', 'end', 'label' and 'text' columns.

    Returns:
        An HTML string: the text wrapped in a styled div, with each entity
        wrapped in a colored span carrying a tooltip with its label.
    """
    import html  # local import: top-of-file import block is outside this edit

    if df_entities.empty:
        return text
    # Sort entities by start index descending to insert highlights without affecting subsequent indices
    entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
    highlighted_text = text
    for entity in entities:
        start = entity['start']
        end = entity['end']
        label = entity['label']
        entity_text = entity['text']
        color = entity_color_map.get(label, '#000000')

        # FIX: escape the model-extracted text and label before embedding them
        # in markup; raw '<', '&' or '"' would otherwise break the report HTML
        # or inject content (the input text is user-controlled).
        # NOTE(review): text outside entity spans is still emitted unescaped,
        # as in the original — a full fix would escape the whole document.
        safe_label = html.escape(label, quote=True)
        safe_text = html.escape(entity_text)

        # Create a span with background color and tooltip
        highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{safe_label}">{safe_text}</span>'

        # Replace the original text segment with the highlighted HTML
        highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
    # Use a div to mimic the Streamlit input box style for the report
    return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
80
-
81
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Run LDA over the unique extracted entity strings and return a tidy
    DataFrame of (Topic_ID, Word, Weight) rows for visualization.

    Uses TF-IDF with stop_words='english', max_df=0.95, min_df=1.
    Returns None when fewer than two unique entities exist, or when
    vectorization/fitting fails (the error is surfaced via st.error).
    """
    # Every unique entity string acts as one "document" for the vectorizer.
    docs = df_entities['text'].unique().tolist()

    if len(docs) < 2:
        return None

    top_n = min(num_top_words, len(docs))

    try:
        # stop_words='english' filters common tokens produced by multi-word
        # entities (e.g., "The" from "The White House").
        vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,  # retained at 1 to keep all unique entities
            stop_words='english'
        )
        doc_term_matrix = vectorizer.fit_transform(docs)
        feature_names = vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1
        )
        lda.fit(doc_term_matrix)

        rows = []
        for topic_idx, topic in enumerate(lda.components_):
            # Indices of the top_n highest-weighted features, descending.
            best_indices = topic.argsort()[:-top_n - 1:-1]
            for i in best_indices:
                rows.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': feature_names[i],
                    'Weight': topic[i],
                })
        return pd.DataFrame(rows)

    except Exception as e:
        st.error(f"Topic modeling failed: {e}")
        return None
130
 
131
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of top-word weights for every topic.

    Returns None when the input DataFrame is empty.
    """
    if df_topic_data.empty:
        return None

    # Bubble size encodes the LDA word weight; one row of bubbles per topic.
    figure = px.scatter(
        df_topic_data,
        x='Word',
        y='Topic_ID',
        size='Weight',
        color='Topic_ID',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False},
    )

    figure.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Topic ID",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True, 'autorange': 'reversed'},
        showlegend=True,
        plot_bgcolor='#FFF0F5',
        paper_bgcolor='#FFF0F5',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )

    figure.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))

    return figure
164
 
165
def generate_network_graph(df, raw_text):
    """
    Generates a network graph visualization (Node Plot) with edges
    based on entity co-occurrence in sentences.

    Args:
        df: Entity DataFrame; reads the 'text', 'label', 'score' columns.
        raw_text: Original input text, used only for sentence segmentation.

    Returns:
        A plotly Figure. With fewer than two unique entities, a bare Figure
        whose title explains that there is not enough data.
    """
    # Frequency of each entity surface string across the whole text.
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # Merge counts with unique entities (text + label)
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')

    if unique_entities.shape[0] < 2:
        # Return a simple figure with a message if not enough data
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    num_nodes = len(unique_entities)
    # Evenly spaced angles around a circle, one per node.
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)

    radius = 10

    # Assign circular positions + a little randomness
    # NOTE(review): positions use np.random without a fixed seed, so the
    # layout differs between runs — confirm this jitter is intentional.
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    # Map entity text to its coordinates for easy lookup
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')

    # ----------------------------------------------------------------------
    # 1. Identify Edges (Co-occurrence in sentences)
    # ----------------------------------------------------------------------
    edges = set()

    # Simple sentence segmentation (handles standard punctuation followed by space)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)

    for sentence in sentences:
        # Find unique entities that are substrings of this sentence
        # (case-insensitive containment check).
        entities_in_sentence = []
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)

        # Create edges (pairs) based on co-occurrence
        unique_entities_in_sentence = list(set(entities_in_sentence))

        # Create all unique pairs (edges)
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]

                # Ensure consistent order for the set to avoid duplicates like (A, B) and (B, A)
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)

    # ----------------------------------------------------------------------
    # 2. Create Plotly Trace Data for Edges
    # ----------------------------------------------------------------------
    edge_x = []
    edge_y = []

    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            # Append coordinates for line segment: [x1, x2, None] for separation
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()

    # Add Edge Trace (Lines)
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False  # Edges don't need a legend entry
    )
    fig.add_trace(edge_trace)

    # ----------------------------------------------------------------------
    # 3. Add Node Trace (Markers)
    # ----------------------------------------------------------------------
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        # FIX: Explicitly set showlegend=False for the main node trace
        # as we are creating separate traces for the legend colors below.
        showlegend=False,
        marker=dict(
            # Node size grows linearly with entity frequency.
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # Adding discrete traces for the legend based on unique labels
    # (one invisible marker per label, so the legend shows label colors).
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None],
                y=[None],
                mode='markers',
                marker=dict(size=10, color=color),
                name=f"{label.capitalize()}",
                showlegend=True  # Ensure legend traces are explicitly visible
            ))
    for trace in legend_traces:
        fig.add_trace(trace)

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        # Set explicit range to ensure padding for text labels on the edge
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )

    return fig
309
-
310
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """
    Generates a full HTML report containing all analysis results and visualizations.
    FIXED: Treemap color (added color_continuous_scale) and chart overlap (set explicit heights).

    Args:
        df: Entity DataFrame with 'text', 'label', 'score', 'start', 'end', 'category'.
        text_input: The original analyzed text.
        elapsed_time: Processing time in seconds, shown in the report metadata.
        df_topic_data: Topic-modeling DataFrame, or None when unavailable.

    Returns:
        A complete standalone HTML document as a string.
    """

    # 1. Generate Visualizations (Plotly HTML)

    # 1a. Treemap - FIX: Added color_continuous_scale to ensure color renders in static HTML
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_continuous_scale=px.colors.sequential.Agsunset # Force a color scale
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), height=500) # Added height for treemap
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie Chart - FIX: Set explicit height to prevent overlap in the grid
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=px.colors.sequential.RdBu)
    fig_pie.update_layout(margin=dict(t=50, b=10), height=400)
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar Chart (Category Count) - FIX: Set explicit height
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10), height=400)
    bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs='cdn')

    # 1d. Bar Chart (Most Frequent Entities) - FIX: Set explicit height
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']

    # Top 10 repeating entities
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    # Fallback markup used when no entity repeats.
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'

    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Plasma)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10), height=400)
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network Graph HTML - UPDATED to pass text_input
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic Charts HTML (Now a single Bubble Chart with Placeholder logic)
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        # Placeholder for low data
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Get Highlighted Text
    # The .replace swaps the inline-styled div for the report's CSS class.
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")

    # 3. Entity Tables (Pandas to HTML)
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the Final HTML
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
        .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
        h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
        h3 {{ color: #555; margin-top: 20px; }}
        .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
        .grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px; }}
        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }}
        table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        table th {{ background-color: #f0f0f0; }}
        /* Specific styling for highlighted text element */
        .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
        @media (max-width: 768px) {{ .grid {{ grid-template-columns: 1fr; }} }}
    </style></head><body>
    <div class="container">
        <h1>Entity and Topic Analysis Report</h1>

        <div class="metadata">
            <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
            <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
        </div>
        <!-- Section 1: Original Text & Highlighted Entities -->
        <h2>1. Analyzed Text & Extracted Entities</h2>
        <h3>Original Text with Highlighted Entities</h3>
        <div class="highlighted-text-container">
            {highlighted_text_html}
        </div>

        <!-- Section 2: Full Extracted Entities Table -->
        <h2>2. Full Extracted Entities Table</h2>
        {entity_table_html}
        <!-- Section 3: Visualizations (Treemap, Pie, Bar Charts) -->
        <h2>3. Data Visualizations</h2>

        <h3>3.1 Entity Distribution Treemap</h3>
        <div class="chart-box">{treemap_html}</div>
        <h3>3.2 Comparative Charts (Pie, Category Count, Frequency)</h3>
        <div class="grid">
            <div class="chart-box">{pie_html}</div>
            <div class="chart-box">{bar_category_html}</div>
            <div class="chart-box">{bar_freq_html}</div>
        </div>
        <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
        <div class="chart-box">{network_html}</div>

        <!-- Section 4: Topic Modeling -->
        <h2>4. Topic Modeling (LDA on Entities)</h2>
        {topic_charts_html}

    </div></body></html>
    """
    return html_content
443
-
 
 
 
 
 
 
 
 
 
 
 
 
 
444
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
# Inject app-wide CSS (pink theme) via raw HTML; requires unsafe_allow_html.
st.markdown(
    """
    <style>
    /* Overall app container - NO SIDEBAR */
    .main {
        background-color: #FFF0F5; /* Blanched Almond/Light Pink */
        color: #333333; /* Dark grey text for contrast */
    }
    .stApp {
        background-color: #FFF0F5;
    }
    /* Text Area background and text color (input fields) */
    .stTextArea textarea {
        background-color: #FFFAF0; /* Floral White/Near white for input fields */
        color: #000000; /* Black text for input */
        border: 1px solid #FF69B4; /* Deep Pink border */
    }
    /* Button styling */
    .stButton > button {
        background-color: #FF69B4; /* Deep Pink for the button */
        color: #FFFFFF; /* White text for contrast */
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
    }
    /* Expander header and content background */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
        color: #333333;
    }
    </style>
    """,
    unsafe_allow_html=True)
479
  st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
480
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
481
  expander = st.expander("**Important notes**")
@@ -489,77 +515,77 @@ comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAM
489
# --- Label Definitions and Category Mapping ---
# The NER label set mirrors the color map, so every label has a display color.
labels = list(entity_color_map.keys())
category_mapping = {
    "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
    "Location & Organization": ["location", "organization"],
    "Temporal & Events": ["event", "date"],
    "Digital & Products": ["platform", "product", "media_type", "url"],
}
# Invert the mapping to label -> category for per-entity lookups.
reverse_category_mapping = {}
for _category, _label_list in category_mapping.items():
    for _label in _label_list:
        reverse_category_mapping[_label] = _category
498
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Loads the GLiNER model and caches it.

    Returns:
        The pretrained GLiNER multitask model, constrained to this app's labels.

    Side effects:
        On failure, shows st.error and halts the script via st.stop().
    """
    try:
        # Use nested_ner=True and num_gen_sequences=2 for potentially higher recall
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()

# Load once per process; st.cache_resource shares the model across reruns.
model = load_ner_model()
510
 
511
# --- LONG DEFAULT TEXT (178 Words) ---
# Seed content for the text area (entity-rich sample covering all label types).
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
)
527
  # -----------------------------------
528
 
529
# --- Session State Initialization (CRITICAL FIX) ---
# Seed every key on first run so later reads never fail; in particular the
# text-area key must exist before st.text_area is instantiated.
_SESSION_DEFAULTS = {
    'show_results': False,
    'last_text': "",
    'results_df': pd.DataFrame(),
    'elapsed_time': 0.0,
    'topic_results': None,
    # FIX: Initialize the text area key with default text before st.text_area is called
    'my_text_area': DEFAULT_TEXT,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
543
 
544
# --- Clear Button Function (MODIFIED) ---
def clear_text():
    """Empty the text area and drop all previously computed results."""
    # MODIFIED: the widget key is set to an empty string for true clearing.
    st.session_state['my_text_area'] = ""
    st.session_state['show_results'] = False
    st.session_state['last_text'] = ""
    st.session_state['results_df'] = pd.DataFrame()
    st.session_state['elapsed_time'] = 0.0
    st.session_state['topic_results'] = None
554
 
555
# --- Text Input and Clear Button ---
word_limit = 1000
# The text area reads its content from the pre-initialized 'my_text_area'
# session-state key. FIX: do not also pass `value=` — supplying a default
# while the key is already set in st.session_state makes Streamlit emit the
# "widget ... had its value set via the Session State API" warning, and the
# default is ignored anyway.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area'
)

word_count = len(text.split())
@@ -568,15 +594,15 @@ st.button("Clear text", on_click=clear_text)
568
 
569
  # --- Results Trigger and Processing (Updated Logic) ---
570
  if st.button("Results"):
571
- if not text.strip():
572
- st.warning("Please enter some text to extract entities.")
573
- st.session_state.show_results = False
574
- elif word_count > word_limit:
575
- st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
576
- st.session_state.show_results = False
577
- else:
578
- with st.spinner("Extracting entities and generating report data...", show_time=True):
579
- if text != st.session_state.last_text:
580
  st.session_state.last_text = text
581
  start_time = time.time()
582
 
@@ -599,6 +625,7 @@ if st.button("Results"):
599
  )
600
 
601
  if comet_initialized:
 
602
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
603
  experiment.log_parameter("input_text", text)
604
  experiment.log_table("predicted_entities", df)
@@ -611,8 +638,8 @@ if st.button("Results"):
611
  st.session_state.elapsed_time = end_time - start_time
612
 
613
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
614
- st.session_state.show_results = True
615
-
616
  # --- Display Download Link and Results (FIXED INDENTATION AND NEW LAYOUT) ---
617
  if st.session_state.show_results:
618
  df = st.session_state.results_df
@@ -671,14 +698,15 @@ if st.session_state.show_results:
671
  # TAB 2: Treemap
672
  with tab_treemap_viz:
673
  st.markdown("#### Treemap: Entity Distribution")
674
- # Treemap (Uses the corrected color in the report generation function)
 
675
  fig_treemap = px.treemap(
676
  df,
677
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
678
- values='score',
679
  color='category',
680
  title="Entity Distribution by Category and Label",
681
- color_continuous_scale=px.colors.sequential.Agsunset # Added color scale here for Streamlit preview too
682
  )
683
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
684
  st.plotly_chart(fig_treemap, use_container_width=True)
@@ -687,16 +715,22 @@ if st.session_state.show_results:
687
  st.markdown("---")
688
  st.markdown("### 4. Comparative Charts")
689
 
 
 
690
  col1, col2, col3 = st.columns(3)
691
 
692
  # Pie Chart
693
  grouped_counts = df['category'].value_counts().reset_index()
694
  grouped_counts.columns = ['Category', 'Count']
695
- fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution by Category', color_discrete_sequence=px.colors.sequential.RdBu)
 
 
696
  with col1:
697
  st.plotly_chart(fig_pie, use_container_width=True)
698
  # Category Count Bar Chart
699
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
 
 
700
  with col2:
701
  st.plotly_chart(fig_bar_category, use_container_width=True)
702
  # Most Frequent Entities Bar Chart
@@ -705,68 +739,48 @@ if st.session_state.show_results:
705
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
706
  fig_bar_freq = go.Figure().update_layout(title="No repeating entities for plot")
707
  if not repeating_entities.empty:
708
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Plasma)
 
 
709
  with col3:
710
  st.plotly_chart(fig_bar_freq, use_container_width=True)
711
-
712
- # 5. Network Graph (NOW OUTSIDE ALL TABS)
713
  st.markdown("---")
714
  st.markdown("### 5. Entity Co-occurrence Network")
715
- st.markdown("Edges connect entities that appear in the same sentence.")
716
- fig_network = generate_network_graph(df, st.session_state.last_text)
717
- if not isinstance(fig_network, go.Figure):
718
- # If the function returned the string message (not enough data)
719
- st.info(fig_network.layout.title.text)
720
- else:
721
- st.plotly_chart(fig_network, use_container_width=True)
722
-
723
  # 6. Topic Modeling
724
  st.markdown("---")
725
  st.markdown("### 6. Topic Modeling (LDA on Entities)")
726
  if df_topic_data is not None and not df_topic_data.empty:
727
- st.markdown("##### Topic Word Weights (Bubble Chart)")
728
  bubble_figure = create_topic_word_bubbles(df_topic_data)
729
- st.plotly_chart(bubble_figure, use_container_width=True)
 
 
 
730
  else:
731
- st.info("Topic Modeling requires at least two unique entities to generate the Topic Bubble Chart.")
732
 
733
- # 7. Download Button (HTML Report)
734
- # Generate the full report HTML for download
735
- report_html_content = generate_html_report(
736
- df,
737
- st.session_state.last_text,
738
- st.session_state.elapsed_time,
739
- df_topic_data
740
- )
741
 
742
- # Convert HTML content to bytes for download
743
- b64_html = io.BytesIO(report_html_content.encode('utf-8'))
 
 
 
 
 
744
 
745
- st.markdown("---")
746
- with stylable_container(
747
- key="download_container",
748
- css_styles="""
749
- button {
750
- background-color: #007bff;
751
- color: white;
752
- font-weight: bold;
753
- border: 2px solid #007bff;
754
- padding: 10px 20px;
755
- border-radius: 8px;
756
- }
757
- button:hover {
758
- background-color: #0056b3;
759
- }
760
- """
761
- ):
762
- st.download_button(
763
- label="Download Full HTML Report 📥",
764
- data=b64_html,
765
- file_name=f"entity_topic_report_{time.strftime('%Y%m%d_%H%M%S')}.html",
766
- mime="text/html",
767
- )
768
-
769
-
770
 
771
 
772
 
 
18
  from streamlit_extras.stylable_container import stylable_container
19
  # Using a try/except for comet_ml import
20
try:
    from comet_ml import Experiment
except ImportError:
    # comet_ml is optional: fall back to a no-op stub with the same
    # interface so the rest of the app can call it unconditionally.
    class Experiment:
        """No-op stand-in for comet_ml.Experiment when comet_ml is absent."""

        def __init__(self, **kwargs):
            pass

        def log_parameter(self, *args):
            pass

        def log_table(self, *args):
            pass

        def end(self):
            pass
28
# --- Model Home Directory (Fix for deployment environments) ---
# Set HF_HOME to a writable path so Hugging Face model downloads/caches
# succeed on read-only deployment filesystems.
os.environ['HF_HOME'] = '/tmp'
31
  # --- Color Map for Highlighting and Network Graph Nodes ---
32
# Maps each NER label to a fixed hex color, used both for inline text
# highlighting and for network-graph node/legend colors.
entity_color_map = {
    "person": "#10b981",
    "username": "#3b82f6",
    "hashtag": "#4ade80",
    "mention": "#f97316",
    "organization": "#f59e0b",
    "community": "#8b5cf6",
    "position": "#ec4899",
    "location": "#06b6d4",
    "event": "#f43f5e",
    "product": "#a855f7",
    "platform": "#eab308",
    "date": "#6366f1",
    "media_type": "#14b8a6",
    "url": "#60a5fa",
    "nationality_religion": "#fb7185",
}
49
  # --- Utility Functions ---
50
def extract_label(node_name):
    """Extract the label from a node string like 'Text (Label)'.

    Returns the text inside the trailing parentheses, or "Unknown" when
    the string has no trailing '(...)' group.
    """
    match = re.search(r'\(([^)]+)\)$', node_name)
    return match.group(1) if match else "Unknown"
54
 
55
  def remove_trailing_punctuation(text_string):
56
+ """Removes trailing punctuation from a string."""
57
+ return text_string.rstrip(string.punctuation)
58
 
59
  def highlight_entities(text, df_entities):
60
+ """Generates HTML to display text with entities highlighted and colored."""
61
+ if df_entities.empty:
62
+ return text
63
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
64
+ entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
65
+ highlighted_text = text
66
+ for entity in entities:
67
+ start = entity['start']
68
+ end = entity['end']
69
+ label = entity['label']
70
+ entity_text = entity['text']
71
+ color = entity_color_map.get(label, '#000000')
72
+
73
+ # Create a span with background color and tooltip
74
+ highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
75
+
76
+ # Replace the original text segment with the highlighted HTML
77
+ highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
78
  # Use a div to mimic the Streamlit input box style for the report
79
+ return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
80
+
81
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """Run LDA topic modeling over the extracted entity strings.

    Each unique entity text is treated as one document. Returns a DataFrame
    with columns ['Topic_ID', 'Word', 'Weight'] for visualization, or None
    when there are fewer than two unique entities or modeling fails.

    Uses TF-IDF with stop_words='english', max_df=0.95, min_df=1.
    """
    # Aggregate all unique entity text into a single document list.
    documents = df_entities['text'].unique().tolist()

    if len(documents) < 2:
        return None

    # Cap the number of reported words by the number of documents.
    N = min(num_top_words, len(documents))

    try:
        # stop_words='english' filters common words tokenized out of
        # multi-word entities (e.g., "The" from "The White House").
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,  # keep every unique entity token
            stop_words='english'
        )
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)

        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # argsort ascending, so slice from the end for the top-N words.
            top_words_indices = topic.argsort()[:-N - 1:-1]
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]

            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })

        return pd.DataFrame(topic_data_list)

    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Topic modeling failed: {e}")
        return None
131
 
132
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of per-topic word weights.

    Expects the DataFrame produced by perform_topic_modeling (columns
    'Topic_ID', 'Word', 'Weight'). Returns a plotly Figure, or None when
    the input is empty.
    """
    if df_topic_data.empty:
        return None

    fig = px.scatter(
        df_topic_data,
        x='Word',
        y='Topic_ID',
        size='Weight',       # bubble area encodes the LDA word weight
        color='Topic_ID',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        hover_data={'Word': True, 'Weight': ':.3f', 'Topic_ID': False}
    )

    fig.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Topic ID",
        xaxis={'tickangle': -45, 'showgrid': False},
        yaxis={'showgrid': True, 'autorange': 'reversed'},
        showlegend=True,
        plot_bgcolor='#FFF0F5',
        paper_bgcolor='#FFF0F5',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )

    fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))

    return fig
165
 
166
def generate_network_graph(df, raw_text):
    """Build an entity co-occurrence network as a Plotly figure.

    Nodes are unique (text, label) entities sized by frequency; an edge
    connects two entities that appear in the same sentence of *raw_text*.
    Returns a go.Figure; when fewer than two unique entities exist, the
    figure carries only an explanatory title.
    """
    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # Merge counts with unique entities (text + label).
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')

    if unique_entities.shape[0] < 2:
        # Not enough data for a meaningful graph.
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    num_nodes = len(unique_entities)
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)

    radius = 10

    # Circular layout with a little jitter so labels don't overlap exactly.
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    # Map entity text to its coordinates for easy lookup.
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')

    # ------------------------------------------------------------------
    # 1. Identify edges (co-occurrence in sentences)
    # ------------------------------------------------------------------
    edges = set()

    # Simple sentence segmentation (punctuation followed by whitespace,
    # with lookbehinds to avoid splitting on common abbreviations).
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)

    for sentence in sentences:
        # Case-insensitive substring match of each entity in the sentence.
        entities_in_sentence = []
        for entity_text in unique_entities['text'].unique():
            if entity_text.lower() in sentence.lower():
                entities_in_sentence.append(entity_text)

        unique_entities_in_sentence = list(set(entities_in_sentence))

        # All unique pairs of co-occurring entities become edges.
        for i in range(len(unique_entities_in_sentence)):
            for j in range(i + 1, len(unique_entities_in_sentence)):
                node1 = unique_entities_in_sentence[i]
                node2 = unique_entities_in_sentence[j]

                # Sort so (A, B) and (B, A) collapse to one edge.
                edge_tuple = tuple(sorted((node1, node2)))
                edges.add(edge_tuple)

    # ------------------------------------------------------------------
    # 2. Create Plotly trace data for edges
    # ------------------------------------------------------------------
    edge_x = []
    edge_y = []

    for edge in edges:
        n1, n2 = edge
        if n1 in pos_map and n2 in pos_map:
            # [x1, x2, None] — the None breaks the line between segments.
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()

    # Edge trace (lines).
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False  # edges don't need a legend entry
    )
    fig.add_trace(edge_trace)

    # ------------------------------------------------------------------
    # 3. Node trace (markers)
    # ------------------------------------------------------------------
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        # The node trace stays out of the legend; dedicated per-label
        # traces below provide the colored legend entries.
        showlegend=False,
        marker=dict(
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # One invisible marker trace per distinct label, so the legend shows
    # the label/color mapping.
    legend_traces = []
    seen_labels = set()
    for index, row in unique_entities.iterrows():
        label = row['label']
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None],
                y=[None],
                mode='markers',
                marker=dict(size=10, color=color),
                name=f"{label.capitalize()}",
                showlegend=True  # legend traces must be explicitly visible
            ))
    for trace in legend_traces:
        fig.add_trace(trace)

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        # Explicit range leaves padding for text labels at the rim.
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )

    return fig
310
+
311
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """Assemble the full standalone HTML analysis report.

    Embeds the highlighted text, the entity table, and all Plotly charts
    (treemap, pie, bar charts, network graph, topic bubbles) as inline
    HTML. Returns the report as one HTML string.

    FIX 1: Treemap uses a discrete color sequence to prevent black tiles.
    FIX 2: CSS grid uses min-width on items to prevent plot overlap.
    """
    # 1. Generate visualizations (Plotly HTML fragments)

    # 1a. Treemap — explicit color_discrete_sequence keeps tiles colored.
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24  # robust color sequence
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie chart of entity counts per category.
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category',
                     title='Distribution of Entities by Category',
                     color_discrete_sequence=px.colors.sequential.RdBu)
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar chart (category counts).
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
                              color='Category', title='Total Entities per Category',
                              color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},
                                   margin=dict(t=50, b=10))
    bar_category_html = fig_bar_category.to_html(full_html=False,
                                                 include_plotlyjs='cdn')

    # 1d. Bar chart (most frequent entities).
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']

    # Top 10 repeating entities (count > 1).
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'

    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
                              color='Entity', title='Top 10 Most Frequent Entities',
                              color_discrete_sequence=px.colors.sequential.Plasma)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},
                                   margin=dict(t=50, b=10))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network graph — co-occurrence over the raw input text.
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic chart (single bubble chart, with placeholder for low data).
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        # Placeholder shown when there is not enough data for topics.
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Highlighted text (re-tag the wrapper div with the report CSS class).
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")

    # 3. Entity table (pandas to HTML).
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the final HTML document.
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Entity and Topic Analysis Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
    body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
    .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
    h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
    h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
    h3 {{ color: #555; margin-top: 20px; }}
    .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
    /* FIX 2: Modified grid to ensure each item gets min 30% of the container width */
    .grid {{
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); /* Adjusted min-width for better fit */
        gap: 20px;
        margin-top: 20px;
    }}
    .chart-box {{
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        /* Important: Set a minimum width for the chart box in the grid */
        min-width: 0;
    }}
    table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
    table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
    table th {{ background-color: #f0f0f0; }}
    /* Specific styling for highlighted text element */
    .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
    @media (max-width: 1050px) {{ /* Increased breakpoint to help prevent overlap */
        .grid {{
            grid-template-columns: 1fr; /* Stack charts vertically on smaller screens */
        }}
    }}
    </style></head><body>
    <div class="container">
    <h1>Entity and Topic Analysis Report</h1>

    <div class="metadata">
    <p><strong>Generated At:</strong> {time.strftime('%Y-%m-%d %H:%M:%S')}</p>
    <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
    </div>
    <h2>1. Analyzed Text & Extracted Entities</h2>
    <h3>Original Text with Highlighted Entities</h3>
    <div class="highlighted-text-container">
    {highlighted_text_html}
    </div>

    <h2>2. Full Extracted Entities Table</h2>
    {entity_table_html}
    <h2>3. Data Visualizations</h2>

    <h3>3.1 Entity Distribution Treemap</h3>
    <div class="chart-box">{treemap_html}</div>
    <h3>3.2 Comparative Charts (Pie, Category Count, Frequency)</h3>
    <div class="grid">
    <div class="chart-box">{pie_html}</div>
    <div class="chart-box">{bar_category_html}</div>
    <div class="chart-box">{bar_freq_html}</div>
    </div>
    <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
    <div class="chart-box">{network_html}</div>

    <h2>4. Topic Modeling (LDA on Entities)</h2>
    {topic_charts_html}

    </div></body></html>
    """
    return html_content
469
+
470
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
st.markdown(
    """
    <style>
    /* Overall app container - NO SIDEBAR */
    .main {
        background-color: #FFF0F5; /* Blanched Almond/Light Pink */
        color: #333333; /* Dark grey text for contrast */
    }
    .stApp {
        background-color: #FFF0F5;
    }
    /* Text Area background and text color (input fields) */
    .stTextArea textarea {
        background-color: #FFFAF0; /* Floral White/Near white for input fields */
        color: #000000; /* Black text for input */
        border: 1px solid #FF69B4; /* Deep Pink border */
    }
    /* Button styling */
    .stButton > button {
        background-color: #FF69B4; /* Deep Pink for the button */
        color: #FFFFFF; /* White text for contrast */
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
    }
    /* Expander header and content background */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
        color: #333333;
    }
    </style>
    """,
    unsafe_allow_html=True)
st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
expander = st.expander("**Important notes**")
515
  # --- Label Definitions and Category Mapping ---
516
  labels = list(entity_color_map.keys())
517
  category_mapping = {
518
+ "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
519
+ "Location & Organization": ["location", "organization"],
520
+ "Temporal & Events": ["event", "date"],
521
+ "Digital & Products": ["platform", "product", "media_type", "url"],
522
  }
523
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
524
  # --- Model Loading ---
525
  @st.cache_resource
526
  def load_ner_model():
527
+ """Loads the GLiNER model and caches it."""
528
+ try:
529
+ # Use nested_ner=True and num_gen_sequences=2 for potentially higher recall
530
+ return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
531
+ except Exception as e:
532
+ st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
533
+ st.stop()
534
+
535
  model = load_ner_model()
536
 
537
# --- LONG DEFAULT TEXT (178 Words) ---
# Sample input pre-loaded into the text area so a first-time user can
# generate a report immediately.
DEFAULT_TEXT = (
    "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
    "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
    "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
    "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
    "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
    "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
    "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
    "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
    "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
    "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
    "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
    "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
    "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
)
553
  # -----------------------------------
554
 
555
  # --- Session State Initialization (CRITICAL FIX) ---
556
  if 'show_results' not in st.session_state:
557
+ st.session_state.show_results = False
558
  if 'last_text' not in st.session_state:
559
+ st.session_state.last_text = ""
560
  if 'results_df' not in st.session_state:
561
+ st.session_state.results_df = pd.DataFrame()
562
  if 'elapsed_time' not in st.session_state:
563
+ st.session_state.elapsed_time = 0.0
564
  if 'topic_results' not in st.session_state:
565
+ st.session_state.topic_results = None
566
  # FIX: Initialize the text area key with default text before st.text_area is called
567
  if 'my_text_area' not in st.session_state:
568
+ st.session_state.my_text_area = DEFAULT_TEXT
569
 
570
  # --- Clear Button Function (MODIFIED) ---
571
  def clear_text():
572
+ """Clears the text area (sets it to an empty string) and hides results."""
573
  # MODIFIED: Set to empty string for true clearing
574
+ st.session_state['my_text_area'] = ""
575
+ st.session_state.show_results = False
576
+ st.session_state.last_text = ""
577
+ st.session_state.results_df = pd.DataFrame()
578
+ st.session_state.elapsed_time = 0.0
579
+ st.session_state.topic_results = None
580
 
581
  # --- Text Input and Clear Button ---
582
  word_limit = 1000
583
  # The text area now safely uses the pre-initialized session state value
584
  text = st.text_area(
585
+ f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
586
+ height=250,
587
+ key='my_text_area',
588
+ value=st.session_state.my_text_area
589
  )
590
 
591
  word_count = len(text.split())
 
594
 
595
  # --- Results Trigger and Processing (Updated Logic) ---
596
  if st.button("Results"):
597
+ if not text.strip():
598
+ st.warning("Please enter some text to extract entities.")
599
+ st.session_state.show_results = False
600
+ elif word_count > word_limit:
601
+ st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
602
+ st.session_state.show_results = False
603
+ else:
604
+ with st.spinner("Extracting entities and generating report data...", show_time=True):
605
+ if text != st.session_state.last_text:
606
  st.session_state.last_text = text
607
  start_time = time.time()
608
 
 
625
  )
626
 
627
  if comet_initialized:
628
+ # FIX APPLIED HERE: Corrected indentation for the following lines
629
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
630
  experiment.log_parameter("input_text", text)
631
  experiment.log_table("predicted_entities", df)
 
638
  st.session_state.elapsed_time = end_time - start_time
639
 
640
  st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
641
+ st.session_state.show_results = True
642
+
643
  # --- Display Download Link and Results (FIXED INDENTATION AND NEW LAYOUT) ---
644
  if st.session_state.show_results:
645
  df = st.session_state.results_df
 
698
  # TAB 2: Treemap
699
  with tab_treemap_viz:
700
  st.markdown("#### Treemap: Entity Distribution")
701
+ # Treemap
702
+ # FIX 1 (Streamlit): Added a robust color sequence here too for consistency in the Streamlit plot
703
  fig_treemap = px.treemap(
704
  df,
705
  path=[px.Constant("All Entities"), 'category', 'label', 'text'],
706
+ values='score',
707
  color='category',
708
  title="Entity Distribution by Category and Label",
709
+ color_discrete_sequence=px.colors.qualitative.Dark24 # Applied fix here
710
  )
711
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
712
  st.plotly_chart(fig_treemap, use_container_width=True)
 
715
  st.markdown("---")
716
  st.markdown("### 4. Comparative Charts")
717
 
718
+ # FIX 2 (Streamlit): The Streamlit plot columns (col1, col2, col3) naturally handle overlap,
719
+ # so no change is needed here, the fix is only in the HTML report.
720
  col1, col2, col3 = st.columns(3)
721
 
722
  # Pie Chart
723
  grouped_counts = df['category'].value_counts().reset_index()
724
  grouped_counts.columns = ['Category', 'Count']
725
+ fig_pie = px.pie(grouped_counts, values='Count', names='Category',
726
+ title='Distribution by Category',
727
+ color_discrete_sequence=px.colors.sequential.RdBu)
728
  with col1:
729
  st.plotly_chart(fig_pie, use_container_width=True)
730
  # Category Count Bar Chart
731
+ fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
732
+ color='Category', title='Total Entities per Category',
733
+ color_discrete_sequence=px.colors.qualitative.Pastel)
734
  with col2:
735
  st.plotly_chart(fig_bar_category, use_container_width=True)
736
  # Most Frequent Entities Bar Chart
 
739
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
740
  fig_bar_freq = go.Figure().update_layout(title="No repeating entities for plot")
741
  if not repeating_entities.empty:
742
+ fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
743
+ color='Entity', title='Top 10 Most Frequent Entities',
744
+ color_discrete_sequence=px.colors.sequential.Plasma)
745
  with col3:
746
  st.plotly_chart(fig_bar_freq, use_container_width=True)
747
+
748
+ # 5. Network Graph
749
  st.markdown("---")
750
  st.markdown("### 5. Entity Co-occurrence Network")
751
+ network_fig = generate_network_graph(df, st.session_state.last_text)
752
+ st.plotly_chart(network_fig, use_container_width=True)
753
+
 
 
 
 
 
754
  # 6. Topic Modeling
755
  st.markdown("---")
756
  st.markdown("### 6. Topic Modeling (LDA on Entities)")
757
  if df_topic_data is not None and not df_topic_data.empty:
 
758
  bubble_figure = create_topic_word_bubbles(df_topic_data)
759
+ if bubble_figure:
760
+ st.plotly_chart(bubble_figure, use_container_width=True)
761
+ else:
762
+ st.error("Visualization for Topic Modeling failed.")
763
  else:
764
+ st.info("Topic Modeling requires at least two unique entities and sufficient data to generate meaningful topics.")
765
 
766
+ # Final Report Download
767
+ st.markdown("---")
768
+ st.markdown("### Download Full HTML Report 🚀")
 
 
 
 
 
769
 
770
+ # Generate the full HTML content
771
+ html_report = generate_html_report(
772
+ df=df,
773
+ text_input=st.session_state.last_text,
774
+ elapsed_time=st.session_state.elapsed_time,
775
+ df_topic_data=df_topic_data
776
+ )
777
 
778
+ st.download_button(
779
+ label="Download Analysis Report (.html)",
780
+ data=html_report,
781
+ file_name="entity_analysis_report.html",
782
+ mime="text/html"
783
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
785
 
786