AIEcosystem committed on
Commit
965b307
·
verified ·
1 Parent(s): ec15b51

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +357 -185
src/streamlit_app.py CHANGED
@@ -11,9 +11,12 @@ import numpy as np
11
  import re
12
  import string
13
  import json
14
- # --- Imports for file generation (no pptx) ---
15
  from io import BytesIO
16
- import plotly.io as pio
 
 
 
17
  # ---------------------------
18
  # --- Stable Scikit-learn LDA Imports ---
19
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -36,31 +39,35 @@ except ImportError:
36
  # Set HF_HOME environment variable to a writable path
37
  os.environ['HF_HOME'] = '/tmp'
38
 
39
- # --- Color Map for Highlighting and Network Graph Nodes (NO PINK COLORS) ---
40
  entity_color_map = {
41
  "person": "#10b981",
42
- "country": "#3b82f6",
43
- "city": "#4ade80",
 
44
  "organization": "#f59e0b",
45
- "date": "#8b5cf6",
46
- "time": "#ec4899",
47
- "cardinal": "#06b6d4",
48
- "money": "#f43f5e",
49
- "position": "#a855f7",
 
 
 
 
 
50
  }
51
 
52
- # --- Label Definitions and Category Mapping (Used by the App) ---
53
  labels = list(entity_color_map.keys())
54
- labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
55
  category_mapping = {
56
- "People": ["person", "organization", "position"],
57
- "Locations": ["country", "city"],
58
- "Time": ["date", "time"],
59
- "Numbers": ["money", "cardinal"]}
 
 
60
 
61
- # CORRECTION 1: Reverse category mapping definition moved here for app-wide access
62
- reverse_category_mapping = {label: category
63
- for category, label_list in category_mapping.items() for label in label_list}
64
 
65
  # --- Utility Functions for Analysis and Plotly ---
66
  def extract_label(node_name):
@@ -76,21 +83,25 @@ def highlight_entities(text, df_entities):
76
  """Generates HTML to display text with entities highlighted and colored."""
77
  if df_entities.empty:
78
  return text
 
79
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
80
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
81
  highlighted_text = text
 
82
  for entity in entities:
83
  start = entity['start']
84
  end = entity['end']
85
  label = entity['label']
86
  entity_text = entity['text']
87
  color = entity_color_map.get(label, '#000000')
 
88
  # Create a span with background color and tooltip
89
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
90
  # Replace the original text segment with the highlighted HTML
91
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
 
92
  # Use a div to mimic the Streamlit input box style for the report
93
- return f'<div style="border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
94
 
95
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
96
  """
@@ -100,6 +111,7 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
100
  documents = df_entities['text'].unique().tolist()
101
  if len(documents) < 2:
102
  return None
 
103
  N = min(num_top_words, len(documents))
104
  try:
105
  tfidf_vectorizer = TfidfVectorizer(
@@ -109,6 +121,7 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
109
  )
110
  tfidf = tfidf_vectorizer.fit_transform(documents)
111
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
112
  lda = LatentDirichletAllocation(
113
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
114
  )
@@ -134,6 +147,7 @@ def create_topic_word_bubbles(df_topic_data):
134
  # Renaming columns to match the output of perform_topic_modeling
135
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
136
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
 
137
  if df_topic_data.empty:
138
  return None
139
  fig = px.scatter(
@@ -159,41 +173,45 @@ def create_topic_word_bubbles(df_topic_data):
159
  xaxis={'tickangle': -45, 'showgrid': False},
160
  yaxis={'showgrid': True},
161
  showlegend=True,
162
- plot_bgcolor='#FFFFFF', # Removed pink
163
- paper_bgcolor='#FFFFFF', # Removed pink
164
  height=600,
165
  margin=dict(t=50, b=100, l=50, r=10),
166
  )
167
- fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',marker=dict(line=dict(width=1, color='DarkSlateGrey')))
168
  return fig
169
 
170
  def generate_network_graph(df, raw_text):
171
  """
172
  Generates a network graph visualization (Node Plot) with edges
173
- based on entity co-occurrence in sentences.
174
  """
 
175
  entity_counts = df['text'].value_counts().reset_index()
176
  entity_counts.columns = ['text', 'frequency']
 
177
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
178
  if unique_entities.shape[0] < 2:
179
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
180
 
181
  num_nodes = len(unique_entities)
182
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
 
183
  radius = 10
184
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
185
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
186
- pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
187
 
 
188
  edges = set()
189
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
190
 
 
191
  for sentence in sentences:
192
  entities_in_sentence = []
193
  for entity_text in unique_entities['text'].unique():
194
  if entity_text.lower() in sentence.lower():
195
  entities_in_sentence.append(entity_text)
196
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
197
  for i in range(len(unique_entities_in_sentence)):
198
  for j in range(i + 1, len(unique_entities_in_sentence)):
199
  node1 = unique_entities_in_sentence[i]
@@ -203,6 +221,7 @@ def generate_network_graph(df, raw_text):
203
 
204
  edge_x = []
205
  edge_y = []
 
206
  for edge in edges:
207
  n1, n2 = edge
208
  if n1 in pos_map and n2 in pos_map:
@@ -254,7 +273,7 @@ def generate_network_graph(df, raw_text):
254
  seen_labels.add(label)
255
  color = entity_color_map.get(label, '#cccccc')
256
  legend_traces.append(go.Scatter(
257
- x=[None], y=[None], mode='markers', marker=dict(size=10, color=color),name=f"{label.capitalize()}", showlegend=True
258
  ))
259
  for trace in legend_traces:
260
  fig.add_trace(trace)
@@ -270,8 +289,161 @@ def generate_network_graph(df, raw_text):
270
  margin=dict(t=50, b=10, l=10, r=10),
271
  height=600
272
  )
 
273
  return fig
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  # --- NEW CSV GENERATION FUNCTION ---
276
  def generate_entity_csv(df):
277
  """
@@ -287,12 +459,14 @@ def generate_entity_csv(df):
287
  # -----------------------------------
288
 
289
  # --- Existing App Functionality (HTML) ---
 
290
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
291
  """
292
- Generates a full HTML report containing all analysis results and
293
- visualizations. (Simplified HTML generation for brevity in code)
294
  """
295
  # 1. Generate Visualizations (Plotly HTML)
 
296
  # 1a. Treemap
297
  fig_treemap = px.treemap(
298
  df,
@@ -322,6 +496,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
322
  word_counts.columns = ['Entity', 'Count']
323
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
324
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
 
325
  if not repeating_entities.empty:
326
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
327
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
@@ -340,7 +515,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
340
  else:
341
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
342
  else:
343
- topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #cccccc;">'
344
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
345
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
346
  topic_charts_html += '</div>'
@@ -349,12 +524,9 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
349
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
350
 
351
  # 3. Entity Tables (Pandas to HTML)
352
- # The grouped by category table is used here for the HTML export
353
- grouped_entity_table_df = df.groupby(['category', 'label']).size().reset_index(name='Count')
354
- grouped_entity_table_df.columns = ['Category', 'Entity', 'Count'] # Column Renaming
355
- grouped_entity_table_html = grouped_entity_table_df.to_html(
356
- classes='table table-striped',
357
- index=False
358
  )
359
 
360
  # 4. Construct the Final HTML
@@ -366,15 +538,15 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
366
  <style>
367
  body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
368
  .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
369
- h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
370
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
371
  h3 {{ color: #555; margin-top: 20px; }}
372
- .metadata {{ background-color: #e6f7ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
373
  .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
374
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
375
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
376
  table th {{ background-color: #f0f0f0; }}
377
- .highlighted-text {{ border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
378
  </style></head><body>
379
  <div class="container">
380
  <h1>Entity and Topic Analysis Report</h1>
@@ -387,45 +559,46 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
387
  <div class="highlighted-text-container">
388
  {highlighted_text_html}
389
  </div>
390
- <h2>2. Entities Count by Category and Entity</h2>
391
- {grouped_entity_table_html}
392
  <h2>3. Data Visualizations</h2>
393
  <h3>3.1 Entity Distribution Treemap</h3>
394
  <div class="chart-box">{treemap_html}</div>
395
- <h3>3.2 Comparative Charts</h3>
396
  <div class="chart-box">{pie_html}</div>
397
  <div class="chart-box">{bar_category_html}</div>
398
  <div class="chart-box">{bar_freq_html}</div>
399
- <h3>3.3 Entity Relationship Map</h3>
400
  <div class="chart-box">{network_html}</div>
401
- <h2>4. Topic Modeling</h2>
402
  {topic_charts_html}
403
  </div></body></html>
404
  """
405
  return html_content
406
 
407
- # --- Page Configuration and Styling (No Sidebar, Removed Pink) ---
 
408
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
409
  st.markdown(
410
  """
411
  <style>
412
  /* Overall app container - NO SIDEBAR */
413
  .main {
414
- background-color: #f0f2f6; /* Light Grey/Default */
415
  color: #333333; /* Dark grey text for contrast */
416
  }
417
  .stApp {
418
- background-color: #f0f2f6;
419
  }
420
  /* Text Area background and text color (input fields) */
421
  .stTextArea textarea {
422
- background-color: #FFFFFF; /* White for input fields */
423
  color: #000000; /* Black text for input */
424
- border: 1px solid #CCCCCC; /* Neutral border */
425
  }
426
  /* Button styling */
427
  .stButton > button {
428
- background-color: #007bff; /* Blue for the button */
429
  color: #FFFFFF; /* White text for contrast */
430
  border: none;
431
  padding: 10px 20px;
@@ -433,20 +606,19 @@ st.markdown(
433
  }
434
  /* Expander header and content background */
435
  .streamlit-expanderHeader, .streamlit-expanderContent {
436
- background-color: #e6f7ff; /* Very Light Blue/Neutral */
437
  color: #333333;
438
  }
439
  </style>
440
  """,
441
  unsafe_allow_html=True)
442
-
443
- st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
444
- st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
445
-
446
- # CORRECTION 2: Removed duplicated expander. The following is the second, correct one.
447
  expander = st.expander("**Important notes**")
448
- expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.**How to Use:** Type or paste your text (max. 1000 words) into the text area below, press Ctrl + Enter, and then click the 'Results' button.**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
449
-
 
 
450
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
451
 
452
  # --- Comet ML Setup (Placeholder/Conditional) ---
@@ -481,9 +653,9 @@ DEFAULT_TEXT = (
481
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
482
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
483
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
484
- "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
 
485
  # -----------------------------------
486
-
487
  # --- Session State Initialization (CRITICAL FIX) ---
488
  if 'show_results' not in st.session_state:
489
  st.session_state.show_results = False
@@ -515,6 +687,7 @@ text = st.text_area(
515
  height=250,
516
  key='my_text_area',
517
  value=st.session_state.my_text_area)
 
518
  word_count = len(text.split())
519
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
520
  st.button("Clear text", on_click=clear_text)
@@ -532,20 +705,25 @@ if st.button("Results"):
532
  if text != st.session_state.last_text:
533
  st.session_state.last_text = text
534
  start_time = time.time()
 
535
  # --- Model Prediction & Dataframe Creation ---
536
  entities = model.predict_entities(text, labels)
537
  df = pd.DataFrame(entities)
 
538
  if not df.empty:
539
  df['text'] = df['text'].apply(remove_trailing_punctuation)
540
  df['category'] = df['label'].map(reverse_category_mapping)
541
  st.session_state.results_df = df
 
542
  unique_entity_count = len(df['text'].unique())
543
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
 
544
  st.session_state.topic_results = perform_topic_modeling(
545
  df,
546
  num_topics=2,
547
  num_top_words=N_TOP_WORDS_TO_USE
548
  )
 
549
  if comet_initialized:
550
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
551
  experiment.log_parameter("input_text", text)
@@ -554,153 +732,147 @@ if st.button("Results"):
554
  else:
555
  st.session_state.results_df = pd.DataFrame()
556
  st.session_state.topic_results = None
 
557
  end_time = time.time()
558
  st.session_state.elapsed_time = end_time - start_time
559
 
560
- st.session_state.show_results = True
561
-
562
- # --- Results Display ---
563
- if st.session_state.show_results and not st.session_state.results_df.empty:
564
- st.success(f"Processing complete in {st.session_state.elapsed_time:.2f} seconds! 🎉")
565
 
 
 
566
  df = st.session_state.results_df
567
- text_input = st.session_state.last_text
568
- elapsed_time = st.session_state.elapsed_time
569
  df_topic_data = st.session_state.topic_results
570
 
571
- # --- Highlighted Text and Download Buttons (Above Tabs) ---
572
- st.subheader("1. Analyzed Text & Extracted Entities", divider="blue")
573
- st.markdown(
574
- highlight_entities(text_input, df),
575
- unsafe_allow_html=True
576
- )
577
- st.subheader("Downloads", divider="blue")
578
- col1, col2, col3 = st.columns([1, 1, 3])
579
-
580
- # 1. Download CSV
581
- csv_buffer = generate_entity_csv(df)
582
- col1.download_button(
583
- label="Download Entities as CSV",
584
- data=csv_buffer.getvalue(),
585
- file_name="ner_entities.csv",
586
- mime="text/csv"
587
- )
588
- # 2. Download HTML Report
589
- html_content = generate_html_report(df, text_input, elapsed_time, df_topic_data)
590
- col2.download_button(
591
- label="Download Full HTML Report",
592
- data=html_content.encode('utf-8'),
593
- file_name="ner_analysis_report.html",
594
- mime="text/html"
595
- )
596
-
597
- st.markdown("---")
598
-
599
- # CORRECTION 1: Tabs Implementation
600
- tab1, tab2 = st.tabs(["📊 Entity Data (Table) & Glossary", "📈 Visualizations & Topics"])
601
-
602
- with tab1:
603
- # Create the summary table with the requested column name changes
604
- grouped_entity_table = df.groupby(['category', 'label']).size().reset_index(name='Count')
605
- grouped_entity_table.columns = ['Category', 'Entity', 'Count']
606
-
607
- st.markdown("## Entity Counts by Category and Entity")
608
- st.dataframe(grouped_entity_table.sort_values(by=['Category', 'Count'], ascending=[True, False]), use_container_width=True)
609
-
610
  st.markdown("---")
611
- st.markdown("## Glossary of Tags and Category Mapping")
612
-
613
- # Display Category Mapping (forward and reverse)
614
- st.markdown("### Category to Entity Label Mapping (`category_mapping`)")
615
- st.json(category_mapping)
616
-
617
- # Display the requested reverse mapping below the table
618
- st.markdown("### Entity Label to Category Mapping (Reverse Glossary) (`reverse_category_mapping`)")
619
- st.json(reverse_category_mapping) # Display the reverse mapping which was moved to the top
620
-
621
- # Display general glossary
622
- st.markdown("### General Glossary for Extracted Entities")
623
- st.write("""
624
- - **start**: Index of the start of the corresponding entity.
625
- - **end**: Index of the end of the corresponding entity.
626
- - **text**: Entity extracted from your text data.
627
- - **label**: The entity tag assigned to the extracted entity.
628
- - **category**: The broad category (e.g., 'People') derived from the 'label'.
629
- - **score**: Accuracy score; how accurately a tag has been assigned to a given entity.
630
- """)
631
-
632
- with tab2:
633
- st.markdown("## Visualizations")
634
-
635
- # 3a. Treemap (As requested in Tab 2)
636
- fig_treemap = px.treemap(
637
- df,
638
- path=[px.Constant("All Entities"), 'category', 'label', 'text'],
639
- values='score',
640
- color='category',
641
- title="Entity Distribution by Category and Label",
642
- color_discrete_sequence=px.colors.qualitative.Dark24
643
- )
644
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
645
- st.markdown("### Entity Distribution Treemap")
646
- st.plotly_chart(fig_treemap, use_container_width=True)
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  st.markdown("---")
649
- # 3b. Pie Chart and Category Bar Chart side-by-side
650
- col_pie, col_bar_cat = st.columns(2)
 
651
 
652
- # Pie Chart
653
  grouped_counts = df['category'].value_counts().reset_index()
654
  grouped_counts.columns = ['Category', 'Count']
655
- fig_pie = px.pie(grouped_counts, values='Count', names='Category',
656
- title='Distribution of Entities by Category',
657
- color_discrete_sequence=px.colors.sequential.RdBu)
658
- fig_pie.update_layout(margin=dict(t=50, b=10))
659
- with col_pie:
660
- st.markdown("### Distribution of Entities by Category")
661
  st.plotly_chart(fig_pie, use_container_width=True)
662
- # Category Bar Chart
663
- fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
664
- color='Category', title='Total Entities per Category',
665
- color_discrete_sequence=px.colors.qualitative.Pastel)
666
- fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10))
667
- with col_bar_cat:
668
- st.markdown("### Total Entities per Category")
669
  st.plotly_chart(fig_bar_category, use_container_width=True)
670
 
671
- st.markdown("---")
672
- # 3c. Most Frequent Entities Bar Chart
673
- word_counts = df['text'].value_counts().reset_index()
674
- word_counts.columns = ['Entity', 'Count']
675
- repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
676
- st.markdown("### Top 10 Most Frequent Entities")
677
- if not repeating_entities.empty:
678
- fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
679
- color='Entity', title='Top 10 Most Frequent Entities',
680
- color_discrete_sequence=px.colors.sequential.Plasma)
681
- fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
682
- st.plotly_chart(fig_bar_freq, use_container_width=True)
683
- else:
684
- st.info("No entities appear more than once in the text for visualization.")
685
 
686
  st.markdown("---")
687
- # 3d. Network Graph
688
- st.markdown("### Entity Relationship Map")
689
- network_fig = generate_network_graph(df, text_input)
690
  st.plotly_chart(network_fig, use_container_width=True)
691
 
692
  st.markdown("---")
693
-
694
- # 4. Topic Modeling
695
- st.markdown("## Topic Modeling")
696
 
697
  if df_topic_data is not None and not df_topic_data.empty:
698
- st.markdown("### Bubble size = word weight")
699
  bubble_figure = create_topic_word_bubbles(df_topic_data)
700
- st.plotly_chart(bubble_figure, use_container_width=True)
701
-
702
- st.markdown("### Top Words by Topic")
703
- # Simple table display of topic words
704
- st.dataframe(df_topic_data.rename(columns={'Topic_ID': 'Topic ID', 'Word': 'Top Word', 'Weight': 'Weight'}), use_container_width=True, hide_index=True)
705
  else:
706
- st.info("Topic Modeling requires text containing at least two unique entities.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import re
12
  import string
13
  import json
14
+ # --- PPTX Imports ---
15
  from io import BytesIO
16
+ from pptx import Presentation
17
+ from pptx.util import Inches, Pt
18
+ from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
+ import plotly.io as pio # Required for image export
20
  # ---------------------------
21
  # --- Stable Scikit-learn LDA Imports ---
22
  from sklearn.feature_extraction.text import TfidfVectorizer
 
39
  # Set HF_HOME environment variable to a writable path
40
  os.environ['HF_HOME'] = '/tmp'
41
 
42
# --- Color Map for Highlighting and Network Graph Nodes ---
# One fixed hex colour per entity label; used both for the inline text
# highlighting spans and for node colours in the network graph.
entity_color_map = {
    "person": "#10b981",
    "username": "#3b82f6",
    "hashtag": "#4ade80",
    "mention": "#f97316",
    "organization": "#f59e0b",
    "community": "#8b5cf6",
    "position": "#ec4899",
    "location": "#06b6d4",
    "event": "#f43f5e",
    "product": "#a855f7",
    "platform": "#eab308",
    "date": "#6366f1",
    "media_type": "#14b8a6",
    "url": "#60a5fa",
    "nationality_religion": "#fb7185",
}

# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
# The label list is derived from the colour map so the two can never drift
# apart; dict insertion order fixes the label order handed to the model.
labels = list(entity_color_map.keys())

# Broad display categories -> the fine-grained entity labels they contain.
category_mapping = {
    "People & Groups": ["person", "username", "hashtag", "mention", "community", "position", "nationality_religion"],
    "Location & Organization": ["location", "organization"],
    "Temporal & Events": ["event", "date"],
    "Digital & Products": ["platform", "product", "media_type", "url"],
}

# Inverted view: each entity label -> its broad category, for O(1) lookups.
reverse_category_mapping = {
    label: category
    for category, members in category_mapping.items()
    for label in members
}
70
 
 
 
 
71
 
72
  # --- Utility Functions for Analysis and Plotly ---
73
  def extract_label(node_name):
 
83
  """Generates HTML to display text with entities highlighted and colored."""
84
  if df_entities.empty:
85
  return text
86
+
87
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
88
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
89
  highlighted_text = text
90
+
91
  for entity in entities:
92
  start = entity['start']
93
  end = entity['end']
94
  label = entity['label']
95
  entity_text = entity['text']
96
  color = entity_color_map.get(label, '#000000')
97
+
98
  # Create a span with background color and tooltip
99
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
100
  # Replace the original text segment with the highlighted HTML
101
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
102
+
103
  # Use a div to mimic the Streamlit input box style for the report
104
+ return f'<div style="border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
105
 
106
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
107
  """
 
111
  documents = df_entities['text'].unique().tolist()
112
  if len(documents) < 2:
113
  return None
114
+
115
  N = min(num_top_words, len(documents))
116
  try:
117
  tfidf_vectorizer = TfidfVectorizer(
 
121
  )
122
  tfidf = tfidf_vectorizer.fit_transform(documents)
123
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
124
+
125
  lda = LatentDirichletAllocation(
126
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
127
  )
 
147
  # Renaming columns to match the output of perform_topic_modeling
148
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
149
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
150
+
151
  if df_topic_data.empty:
152
  return None
153
  fig = px.scatter(
 
173
  xaxis={'tickangle': -45, 'showgrid': False},
174
  yaxis={'showgrid': True},
175
  showlegend=True,
176
+ plot_bgcolor='#FFF0F5',
177
+ paper_bgcolor='#FFF0F5',
178
  height=600,
179
  margin=dict(t=50, b=100, l=50, r=10),
180
  )
181
+ fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
182
  return fig
183
 
184
  def generate_network_graph(df, raw_text):
185
  """
186
  Generates a network graph visualization (Node Plot) with edges
187
+ based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
188
  """
189
+ # Using the existing generate_network_graph logic from previous context...
190
  entity_counts = df['text'].value_counts().reset_index()
191
  entity_counts.columns = ['text', 'frequency']
192
+
193
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
194
  if unique_entities.shape[0] < 2:
195
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
196
 
197
  num_nodes = len(unique_entities)
198
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
199
+
200
  radius = 10
201
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
202
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
 
203
 
204
+ pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
205
  edges = set()
 
206
 
207
+ sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
208
  for sentence in sentences:
209
  entities_in_sentence = []
210
  for entity_text in unique_entities['text'].unique():
211
  if entity_text.lower() in sentence.lower():
212
  entities_in_sentence.append(entity_text)
213
  unique_entities_in_sentence = list(set(entities_in_sentence))
214
+
215
  for i in range(len(unique_entities_in_sentence)):
216
  for j in range(i + 1, len(unique_entities_in_sentence)):
217
  node1 = unique_entities_in_sentence[i]
 
221
 
222
  edge_x = []
223
  edge_y = []
224
+
225
  for edge in edges:
226
  n1, n2 = edge
227
  if n1 in pos_map and n2 in pos_map:
 
273
  seen_labels.add(label)
274
  color = entity_color_map.get(label, '#cccccc')
275
  legend_traces.append(go.Scatter(
276
+ x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True
277
  ))
278
  for trace in legend_traces:
279
  fig.add_trace(trace)
 
289
  margin=dict(t=50, b=10, l=10, r=10),
290
  height=600
291
  )
292
+
293
  return fig
294
 
295
+
296
+ # --- PPTX HELPER FUNCTIONS (Integrated from generate_report.py) ---
297
+
298
+ def fig_to_image_buffer(fig):
299
+ """
300
+ Converts a Plotly figure object into a BytesIO buffer containing PNG data.
301
+ Requires 'kaleido' to be installed for image export.
302
+ Returns None if export fails.
303
+ """
304
+ try:
305
+ # Use pio.to_image to convert the figure to a PNG byte array
306
+ img_bytes = pio.to_image(fig, format="png", width=900, height=500, scale=2)
307
+ img_buffer = BytesIO(img_bytes)
308
+ return img_buffer
309
+ except Exception as e:
310
+ # In a Streamlit environment, we can't show this error directly in the app execution flow
311
+ print(f"Error converting Plotly figure to image: {e}")
312
+ return None
313
+
314
# --- PPTX GENERATION FUNCTION (Integrated and Adapted) ---

def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_category_mapping):
    """
    Generate a PowerPoint (.pptx) report with the key analysis results.

    Slides produced: title, analyzed source text, entity summary table,
    treemap image, category bar-chart image, and (when available) the
    topic-word bubble chart. Chart slides are silently skipped when
    Plotly image export fails (fig_to_image_buffer returns None).

    Args:
        df: DataFrame of extracted entities; must contain the columns
            'label', 'category', 'text' and 'score'.
        text_input: Raw analyzed text shown on the source-text slide.
        elapsed_time: Processing time in seconds, shown on the title slide.
        df_topic_data: Topic-modeling DataFrame with 'Topic_ID'/'Word'/'Weight'
            columns, or None/empty when topic modeling was not possible.
        reverse_category_mapping: dict mapping an entity label to its display
            category; unknown labels fall back to 'Other'.

    Returns:
        BytesIO buffer positioned at offset 0, containing the .pptx file.
    """
    prs = Presentation()
    # Layout 5 of the default python-pptx template is "Title Only": it has a
    # title placeholder but NO body placeholder, so all content must be added
    # via textboxes, tables, and pictures (which is what this function does).
    chart_layout = prs.slide_layouts[5]

    # 1. Title slide (layout 0 provides both title and subtitle placeholders).
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    title = slide.shapes.title
    subtitle = slide.placeholders[1]
    title.text = "NER & Topic Analysis Report"
    subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"

    # 2. Source text slide: the raw input text in a word-wrapped textbox.
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Analyzed Source Text"

    left = Inches(0.5)
    top = Inches(1.5)
    width = Inches(9.0)
    height = Inches(5.0)
    txBox = slide.shapes.add_textbox(left, top, width, height)
    tf = txBox.text_frame
    tf.margin_top = Inches(0.1)
    tf.margin_bottom = Inches(0.1)
    tf.word_wrap = True
    p = tf.add_paragraph()
    p.text = text_input
    p.font.size = Pt(14)
    p.font.name = 'Arial'

    # 3. Entity summary slide: per-label counts grouped by display category.
    slide = prs.slides.add_slide(chart_layout)
    slide.shapes.title.text = "Entity Summary (Count by Category and Label)"

    grouped_entity_table = df['label'].value_counts().reset_index()
    grouped_entity_table.columns = ['Entity Label', 'Count']
    grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(
        lambda x: reverse_category_mapping.get(x, 'Other')
    )
    grouped_entity_table = grouped_entity_table[['Category', 'Entity Label', 'Count']]

    rows, cols = grouped_entity_table.shape
    x, y, cx, cy = Inches(1), Inches(1.5), Inches(8), Inches(4.5)
    # One extra row reserved for the header.
    table = slide.shapes.add_table(rows + 1, cols, x, y, cx, cy).table

    # Fixed column widths so the table fills the 8-inch frame.
    table.columns[0].width = Inches(2.7)
    table.columns[1].width = Inches(2.8)
    table.columns[2].width = Inches(2.5)

    # Header row.
    for i, col in enumerate(grouped_entity_table.columns):
        cell = table.cell(0, i)
        cell.text = col
        cell.fill.solid()

    # Data rows (offset by 1 to skip the header row).
    for i in range(rows):
        for j in range(cols):
            cell = table.cell(i + 1, j)
            cell.text = str(grouped_entity_table.iloc[i, j])

    # 4. Treemap slide (Plotly figure exported to a static PNG via kaleido).
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_image = fig_to_image_buffer(fig_treemap)

    if treemap_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Entity Distribution Treemap"
        slide.shapes.add_picture(treemap_image, Inches(0.75), Inches(1.5), width=Inches(8.5))

    # 5. Entity-count bar chart slide.
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_bar_category = px.bar(
        grouped_counts,
        x='Category',
        y='Count',
        color='Category',
        title='Total Entities per Category',
        color_discrete_sequence=px.colors.qualitative.Pastel
    )
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
    bar_category_image = fig_to_image_buffer(fig_bar_category)

    if bar_category_image:
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Total Entities per Category"
        slide.shapes.add_picture(bar_category_image, Inches(0.75), Inches(1.5), width=Inches(8.5))

    # 6. Topic-modeling bubble chart slide.
    if df_topic_data is not None and not df_topic_data.empty:
        # Rename columns to the schema create_topic_word_bubbles expects.
        df_topic_data_pptx = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
        bubble_figure = create_topic_word_bubbles(df_topic_data_pptx)
        bubble_image = fig_to_image_buffer(bubble_figure)
        if bubble_image:
            slide = prs.slides.add_slide(chart_layout)
            slide.shapes.title.text = "Topic Word Weights (Bubble Chart)"
            slide.shapes.add_picture(bubble_image, Inches(0.75), Inches(1.5), width=Inches(8.5))
    else:
        # Placeholder slide when topic modeling was not possible.
        # BUGFIX: the "Title Only" layout has no body placeholder, so the
        # original `slide.placeholders[1]` raised KeyError here; write the
        # message into an explicit textbox instead.
        slide = prs.slides.add_slide(chart_layout)
        slide.shapes.title.text = "Topic Modeling Results"
        note_box = slide.shapes.add_textbox(Inches(0.5), Inches(1.5), Inches(9.0), Inches(1.0))
        note_frame = note_box.text_frame
        note_frame.word_wrap = True
        note_frame.text = "Topic Modeling requires more unique input (at least two unique entities)."

    # Serialize the presentation into an in-memory buffer for download.
    pptx_buffer = BytesIO()
    prs.save(pptx_buffer)
    pptx_buffer.seek(0)
    return pptx_buffer
446
+
447
  # --- NEW CSV GENERATION FUNCTION ---
448
  def generate_entity_csv(df):
449
  """
 
459
  # -----------------------------------
460
 
461
  # --- Existing App Functionality (HTML) ---
462
+
463
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
464
  """
465
+ Generates a full HTML report containing all analysis results and visualizations.
466
+ (Content omitted for brevity but assumed to be here).
467
  """
468
  # 1. Generate Visualizations (Plotly HTML)
469
+
470
  # 1a. Treemap
471
  fig_treemap = px.treemap(
472
  df,
 
496
  word_counts.columns = ['Entity', 'Count']
497
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
498
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
499
+
500
  if not repeating_entities.empty:
501
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
502
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
 
515
  else:
516
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
517
  else:
518
+ topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #FF69B4;">'
519
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
520
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
521
  topic_charts_html += '</div>'
 
524
  highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
525
 
526
  # 3. Entity Tables (Pandas to HTML)
527
+ entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
528
+ classes='table table-striped',
529
+ index=False
 
 
 
530
  )
531
 
532
  # 4. Construct the Final HTML
 
538
  <style>
539
  body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
540
  .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
541
+ h1 {{ color: #FF69B4; border-bottom: 3px solid #FF69B4; padding-bottom: 10px; margin-top: 0; }}
542
  h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
543
  h3 {{ color: #555; margin-top: 20px; }}
544
+ .metadata {{ background-color: #FFE4E1; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
545
  .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
546
  table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
547
  table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
548
  table th {{ background-color: #f0f0f0; }}
549
+ .highlighted-text {{ border: 1px solid #FF69B4; padding: 15px; border-radius: 5px; background-color: #FFFAF0; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
550
  </style></head><body>
551
  <div class="container">
552
  <h1>Entity and Topic Analysis Report</h1>
 
559
  <div class="highlighted-text-container">
560
  {highlighted_text_html}
561
  </div>
562
+ <h2>2. Full Extracted Entities Table</h2>
563
+ {entity_table_html}
564
  <h2>3. Data Visualizations</h2>
565
  <h3>3.1 Entity Distribution Treemap</h3>
566
  <div class="chart-box">{treemap_html}</div>
567
+ <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
568
  <div class="chart-box">{pie_html}</div>
569
  <div class="chart-box">{bar_category_html}</div>
570
  <div class="chart-box">{bar_freq_html}</div>
571
+ <h3>3.3 Entity Co-occurrence Network (Edges = Same Sentence)</h3>
572
  <div class="chart-box">{network_html}</div>
573
+ <h2>4. Topic Modeling (LDA on Entities)</h2>
574
  {topic_charts_html}
575
  </div></body></html>
576
  """
577
  return html_content
578
 
579
+
580
+ # --- Page Configuration and Styling (No Sidebar) ---
581
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
582
  st.markdown(
583
  """
584
  <style>
585
  /* Overall app container - NO SIDEBAR */
586
  .main {
587
+ background-color: #FFF0F5; /* Blanched Almond/Light Pink */
588
  color: #333333; /* Dark grey text for contrast */
589
  }
590
  .stApp {
591
+ background-color: #FFF0F5;
592
  }
593
  /* Text Area background and text color (input fields) */
594
  .stTextArea textarea {
595
+ background-color: #FFFAF0; /* Floral White/Near white for input fields */
596
  color: #000000; /* Black text for input */
597
+ border: 1px solid #FF69B4; /* Deep Pink border */
598
  }
599
  /* Button styling */
600
  .stButton > button {
601
+ background-color: #FF69B4; /* Deep Pink for the button */
602
  color: #FFFFFF; /* White text for contrast */
603
  border: none;
604
  padding: 10px 20px;
 
606
  }
607
  /* Expander header and content background */
608
  .streamlit-expanderHeader, .streamlit-expanderContent {
609
+ background-color: #FFE4E1; /* Misty Rose/Lighter Pink */
610
  color: #333333;
611
  }
612
  </style>
613
  """,
614
  unsafe_allow_html=True)
615
# --- App header and branding links ---
st.subheader("NER and Topic Analysis Report Generator", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

# Collapsible usage notes: supported labels, export dependencies, outputs.
# NOTE(review): the text claims fifteen (15) labels — confirm this matches
# len(entity_color_map) defined at the top of the file.
expander = st.expander("**Important notes**")
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`.
**Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
623
 
624
  # --- Comet ML Setup (Placeholder/Conditional) ---
 
653
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
654
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
655
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
656
+ "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
657
+ )
658
  # -----------------------------------
 
659
  # --- Session State Initialization (CRITICAL FIX) ---
660
  if 'show_results' not in st.session_state:
661
  st.session_state.show_results = False
 
687
  height=250,
688
  key='my_text_area',
689
  value=st.session_state.my_text_area)
690
+
691
  word_count = len(text.split())
692
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
693
  st.button("Clear text", on_click=clear_text)
 
705
  if text != st.session_state.last_text:
706
  st.session_state.last_text = text
707
  start_time = time.time()
708
+
709
  # --- Model Prediction & Dataframe Creation ---
710
  entities = model.predict_entities(text, labels)
711
  df = pd.DataFrame(entities)
712
+
713
  if not df.empty:
714
  df['text'] = df['text'].apply(remove_trailing_punctuation)
715
  df['category'] = df['label'].map(reverse_category_mapping)
716
  st.session_state.results_df = df
717
+
718
  unique_entity_count = len(df['text'].unique())
719
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
720
+
721
  st.session_state.topic_results = perform_topic_modeling(
722
  df,
723
  num_topics=2,
724
  num_top_words=N_TOP_WORDS_TO_USE
725
  )
726
+
727
  if comet_initialized:
728
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
729
  experiment.log_parameter("input_text", text)
 
732
  else:
733
  st.session_state.results_df = pd.DataFrame()
734
  st.session_state.topic_results = None
735
+
736
  end_time = time.time()
737
  st.session_state.elapsed_time = end_time - start_time
738
 
739
+ st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
740
+ st.session_state.show_results = True
 
 
 
741
 
742
# --- Display Download Link and Results ---
# Renders the full results UI once analysis has run. Reads the entity
# DataFrame and topic data from session state so reruns (e.g. widget
# interactions) don't re-trigger model inference.
if st.session_state.show_results:
    df = st.session_state.results_df
    df_topic_data = st.session_state.topic_results

    if df.empty:
        st.warning("No entities were found in the provided text.")
    else:
        st.subheader("Analysis Results", divider="blue")

        # 1. Source text with inline entity highlighting (HTML markup).
        st.markdown("### 1. Analyzed Text with Highlighted Entities")
        st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)

        # 2. Per-label entity counts, annotated with their display category.
        st.markdown("### 2. Entity Summary Table (Count by Label)")
        grouped_entity_table = df['label'].value_counts().reset_index()
        grouped_entity_table.columns = ['Entity Label', 'Count']
        grouped_entity_table['Category'] = grouped_entity_table['Entity Label'].map(reverse_category_mapping)
        st.dataframe(grouped_entity_table[['Category', 'Entity Label', 'Count']], use_container_width=True)

        st.markdown("---")

        # 3. Detailed entity analysis: per-category tables + treemap tabs.
        st.markdown("### 3. Detailed Entity Analysis")
        tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])

        with tab_category_details:
            st.markdown("#### Detailed Entities Table (Grouped by Category)")
            # One nested tab per category, sorted by prediction score.
            unique_categories = list(category_mapping.keys())
            tabs_category = st.tabs(unique_categories)
            for category, tab in zip(unique_categories, tabs_category):
                df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
                with tab:
                    st.markdown(f"##### {category} Entities ({len(df_category)} total)")
                    if not df_category.empty:
                        st.dataframe(
                            df_category,
                            use_container_width=True,
                            column_config={'score': st.column_config.NumberColumn(format="%.4f")}
                        )
                    else:
                        st.info(f"No entities of category **{category}** were found in the text.")

        with tab_treemap_viz:
            st.markdown("#### Treemap: Entity Distribution")
            # Hierarchy: all entities -> category -> label -> entity text,
            # sized by prediction score.
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                values='score',
                color='category',
                title="Entity Distribution by Category and Label",
                color_discrete_sequence=px.colors.qualitative.Dark24
            )
            fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
            st.plotly_chart(fig_treemap, use_container_width=True)

        # 4. Comparative charts laid out in three side-by-side columns.
        st.markdown("---")
        st.markdown("### 4. Comparative Charts")

        col1, col2, col3 = st.columns(3)

        grouped_counts = df['category'].value_counts().reset_index()
        grouped_counts.columns = ['Category', 'Count']

        with col1:  # Pie chart: share of entities per category.
            fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.RdBu)
            fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_pie, use_container_width=True)

        with col2:  # Bar chart: absolute count per category.
            fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
            fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_bar_category, use_container_width=True)

        with col3:  # Bar chart: top 10 entities that occur more than once.
            word_counts = df['text'].value_counts().reset_index()
            word_counts.columns = ['Entity', 'Count']
            repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
            if not repeating_entities.empty:
                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
                fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
                st.plotly_chart(fig_bar_freq, use_container_width=True)
            else:
                st.info("No entities repeat for frequency chart.")

        # 5. Co-occurrence network built from entities in the analyzed text.
        st.markdown("---")
        st.markdown("### 5. Entity Co-occurrence Network")
        network_fig = generate_network_graph(df, st.session_state.last_text)
        st.plotly_chart(network_fig, use_container_width=True)

        st.markdown("---")
        st.markdown("### 6. Topic Modeling Analysis")

        # Topic data is None/empty when there were too few unique entities.
        if df_topic_data is not None and not df_topic_data.empty:
            bubble_figure = create_topic_word_bubbles(df_topic_data)
            if bubble_figure:
                st.plotly_chart(bubble_figure, use_container_width=True)
            else:
                st.error("Error generating Topic Word Bubble Chart.")
        else:
            st.info("Topic modeling requires more unique input (at least two unique entities).")

        # --- Report Download ---
        # Three downloadable artifacts built from the same analysis results.
        st.markdown("---")
        st.markdown("### Download Full Report Artifacts")

        # 1. Comprehensive HTML report (all charts embedded as Plotly HTML).
        html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
        st.download_button(
            label="Download Comprehensive HTML Report",
            data=html_report,
            file_name="ner_topic_report.html",
            mime="text/html",
            type="primary"
        )

        # 2. PowerPoint deck (charts exported as static images).
        pptx_buffer = generate_pptx_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data, reverse_category_mapping)
        st.download_button(
            label="Download Presentation Slides (.pptx)",
            data=pptx_buffer,
            file_name="ner_topic_report.pptx",
            mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
            type="primary"
        )

        # 3. Raw extracted entities as CSV.
        csv_buffer = generate_entity_csv(df)
        st.download_button(
            label="Download Extracted Entities (CSV)",
            data=csv_buffer,
            file_name="extracted_entities.csv",
            mime="text/csv",
            type="secondary"
        )