AIEcosystem committed on
Commit
f4ad236
·
verified ·
1 Parent(s): f89b757

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +64 -131
src/streamlit_app.py CHANGED
@@ -41,37 +41,26 @@ entity_color_map = {
41
  "person": "#10b981",
42
  "country": "#3b82f6",
43
  "city": "#4ade80",
44
-
45
  "organization": "#f59e0b",
46
  "date": "#8b5cf6",
47
  "time": "#ec4899",
48
  "cardinal": "#06b6d4",
49
  "money": "#f43f5e",
50
  "position": "#a855f7",
51
-
52
  }
53
 
54
  # --- Label Definitions and Category Mapping (Used by the App) ---
55
  labels = list(entity_color_map.keys())
56
-
57
-
58
-
59
-
60
  labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
61
  category_mapping = {
62
  "People": ["person", "organization", "position"],
63
  "Locations": ["country", "city"],
64
  "Time": ["date", "time"],
65
- "Numbers": ["money", "cardinal"]
66
- }
67
-
68
-
69
-
70
-
71
-
72
-
73
- reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
74
 
 
 
 
75
 
76
  # --- Utility Functions for Analysis and Plotly ---
77
  def extract_label(node_name):
@@ -87,23 +76,19 @@ def highlight_entities(text, df_entities):
87
  """Generates HTML to display text with entities highlighted and colored."""
88
  if df_entities.empty:
89
  return text
90
-
91
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
92
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
93
  highlighted_text = text
94
-
95
  for entity in entities:
96
  start = entity['start']
97
  end = entity['end']
98
  label = entity['label']
99
  entity_text = entity['text']
100
  color = entity_color_map.get(label, '#000000')
101
-
102
  # Create a span with background color and tooltip
103
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
104
  # Replace the original text segment with the highlighted HTML
105
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
106
-
107
  # Use a div to mimic the Streamlit input box style for the report
108
  return f'<div style="border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
109
 
@@ -115,7 +100,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
115
  documents = df_entities['text'].unique().tolist()
116
  if len(documents) < 2:
117
  return None
118
-
119
  N = min(num_top_words, len(documents))
120
  try:
121
  tfidf_vectorizer = TfidfVectorizer(
@@ -125,7 +109,6 @@ def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
125
  )
126
  tfidf = tfidf_vectorizer.fit_transform(documents)
127
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
128
-
129
  lda = LatentDirichletAllocation(
130
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
131
  )
@@ -151,7 +134,6 @@ def create_topic_word_bubbles(df_topic_data):
151
  # Renaming columns to match the output of perform_topic_modeling
152
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
153
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
154
-
155
  if df_topic_data.empty:
156
  return None
157
  fig = px.scatter(
@@ -182,8 +164,7 @@ def create_topic_word_bubbles(df_topic_data):
182
  height=600,
183
  margin=dict(t=50, b=100, l=50, r=10),
184
  )
185
- fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',
186
- marker=dict(line=dict(width=1, color='DarkSlateGrey')))
187
  return fig
188
 
189
  def generate_network_graph(df, raw_text):
@@ -193,29 +174,26 @@ def generate_network_graph(df, raw_text):
193
  """
194
  entity_counts = df['text'].value_counts().reset_index()
195
  entity_counts.columns = ['text', 'frequency']
196
-
197
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
198
  if unique_entities.shape[0] < 2:
199
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
200
 
201
  num_nodes = len(unique_entities)
202
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
203
-
204
  radius = 10
205
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
206
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
207
-
208
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
209
- edges = set()
210
 
 
211
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
 
212
  for sentence in sentences:
213
  entities_in_sentence = []
214
  for entity_text in unique_entities['text'].unique():
215
  if entity_text.lower() in sentence.lower():
216
  entities_in_sentence.append(entity_text)
217
  unique_entities_in_sentence = list(set(entities_in_sentence))
218
-
219
  for i in range(len(unique_entities_in_sentence)):
220
  for j in range(i + 1, len(unique_entities_in_sentence)):
221
  node1 = unique_entities_in_sentence[i]
@@ -225,7 +203,6 @@ def generate_network_graph(df, raw_text):
225
 
226
  edge_x = []
227
  edge_y = []
228
-
229
  for edge in edges:
230
  n1, n2 = edge
231
  if n1 in pos_map and n2 in pos_map:
@@ -277,8 +254,7 @@ def generate_network_graph(df, raw_text):
277
  seen_labels.add(label)
278
  color = entity_color_map.get(label, '#cccccc')
279
  legend_traces.append(go.Scatter(
280
- x=[None], y=[None], mode='markers', marker=dict(size=10, color=color),
281
- name=f"{label.capitalize()}", showlegend=True
282
  ))
283
  for trace in legend_traces:
284
  fig.add_trace(trace)
@@ -294,10 +270,8 @@ name=f"{label.capitalize()}", showlegend=True
294
  margin=dict(t=50, b=10, l=10, r=10),
295
  height=600
296
  )
297
-
298
  return fig
299
 
300
-
301
  # --- NEW CSV GENERATION FUNCTION ---
302
  def generate_entity_csv(df):
303
  """
@@ -313,19 +287,12 @@ def generate_entity_csv(df):
313
  # -----------------------------------
314
 
315
  # --- Existing App Functionality (HTML) ---
316
- # NOTE: Removed the 'grouped_entity_table_html' generation that counted by label,
317
- # keeping only the grouped by category table generation if needed for the HTML report,
318
- # but prioritizing the Streamlit display of the grouped-by-category table.
319
-
320
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
321
  """
322
  Generates a full HTML report containing all analysis results and
323
  visualizations. (Simplified HTML generation for brevity in code)
324
  """
325
- # ... (Plotly chart HTML generation code remains largely the same)
326
-
327
  # 1. Generate Visualizations (Plotly HTML)
328
-
329
  # 1a. Treemap
330
  fig_treemap = px.treemap(
331
  df,
@@ -355,7 +322,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
355
  word_counts.columns = ['Entity', 'Count']
356
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
357
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
358
-
359
  if not repeating_entities.empty:
360
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
361
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
@@ -390,7 +356,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
390
  classes='table table-striped',
391
  index=False
392
  )
393
-
394
  # 4. Construct the Final HTML
395
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
396
  <meta charset="UTF-8">
@@ -438,7 +404,6 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
438
  """
439
  return html_content
440
 
441
-
442
  # --- Page Configuration and Styling (No Sidebar, Removed Pink) ---
443
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
444
  st.markdown(
@@ -474,25 +439,13 @@ st.markdown(
474
  </style>
475
  """,
476
  unsafe_allow_html=True)
477
- st.subheader("NER and Topic Analysis Report Generator", divider="blue")
478
- st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
479
- expander = st.expander("**Important notes**")
480
- expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
481
- **Dependencies:** Note that **image export** requires the Python libraries `plotly` and `kaleido`.
482
 
483
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
 
484
 
 
485
  expander = st.expander("**Important notes**")
486
- expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
487
-
488
- **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
489
-
490
- **How to Use:** Type or paste your text (max. 1000 words) into the text area below, press Ctrl + Enter, and then click the 'Results' button.
491
-
492
- **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
493
-
494
-
495
-
496
 
497
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
498
 
@@ -503,7 +456,7 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
503
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
504
 
505
  # --- Model Loading ---
506
- @st.cache_resource
507
  def load_ner_model():
508
  """Loads the GLiNER model and caches it."""
509
  try:
@@ -528,9 +481,9 @@ DEFAULT_TEXT = (
528
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
529
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
530
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
531
- "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026."
532
- )
533
  # -----------------------------------
 
534
  # --- Session State Initialization (CRITICAL FIX) ---
535
  if 'show_results' not in st.session_state:
536
  st.session_state.show_results = False
@@ -562,7 +515,6 @@ text = st.text_area(
562
  height=250,
563
  key='my_text_area',
564
  value=st.session_state.my_text_area)
565
-
566
  word_count = len(text.split())
567
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
568
  st.button("Clear text", on_click=clear_text)
@@ -580,25 +532,20 @@ if st.button("Results"):
580
  if text != st.session_state.last_text:
581
  st.session_state.last_text = text
582
  start_time = time.time()
583
-
584
  # --- Model Prediction & Dataframe Creation ---
585
  entities = model.predict_entities(text, labels)
586
  df = pd.DataFrame(entities)
587
-
588
  if not df.empty:
589
  df['text'] = df['text'].apply(remove_trailing_punctuation)
590
  df['category'] = df['label'].map(reverse_category_mapping)
591
  st.session_state.results_df = df
592
-
593
  unique_entity_count = len(df['text'].unique())
594
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
595
-
596
  st.session_state.topic_results = perform_topic_modeling(
597
  df,
598
  num_topics=2,
599
  num_top_words=N_TOP_WORDS_TO_USE
600
  )
601
-
602
  if comet_initialized:
603
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
604
  experiment.log_parameter("input_text", text)
@@ -607,31 +554,29 @@ if st.button("Results"):
607
  else:
608
  st.session_state.results_df = pd.DataFrame()
609
  st.session_state.topic_results = None
610
-
611
  end_time = time.time()
612
  st.session_state.elapsed_time = end_time - start_time
613
-
614
- st.session_state.show_results = True
615
 
616
  # --- Results Display ---
617
  if st.session_state.show_results and not st.session_state.results_df.empty:
618
  st.success(f"Processing complete in {st.session_state.elapsed_time:.2f} seconds! ๐ŸŽ‰")
619
-
620
  df = st.session_state.results_df
621
  text_input = st.session_state.last_text
622
  elapsed_time = st.session_state.elapsed_time
623
  df_topic_data = st.session_state.topic_results
624
-
625
  # --- Highlighted Text and Download Buttons (Above Tabs) ---
626
  st.subheader("1. Analyzed Text & Extracted Entities", divider="blue")
627
  st.markdown(
628
  highlight_entities(text_input, df),
629
  unsafe_allow_html=True
630
  )
631
-
632
  st.subheader("Downloads", divider="blue")
633
  col1, col2, col3 = st.columns([1, 1, 3])
634
-
635
  # 1. Download CSV
636
  csv_buffer = generate_entity_csv(df)
637
  col1.download_button(
@@ -640,7 +585,6 @@ if st.session_state.show_results and not st.session_state.results_df.empty:
640
  file_name="ner_entities.csv",
641
  mime="text/csv"
642
  )
643
-
644
  # 2. Download HTML Report
645
  html_content = generate_html_report(df, text_input, elapsed_time, df_topic_data)
646
  col2.download_button(
@@ -649,32 +593,45 @@ if st.session_state.show_results and not st.session_state.results_df.empty:
649
  file_name="ner_analysis_report.html",
650
  mime="text/html"
651
  )
652
-
653
  st.markdown("---")
654
-
655
- # --- Tabs Implementation ---
656
- tab1, tab2 = st.tabs(["๐Ÿ“Š Entity Data (Table)", "๐Ÿ“ˆ Visualizations & Topics"])
657
-
658
  with tab1:
659
  # Create the summary table with the requested column name changes
660
  grouped_entity_table = df.groupby(['category', 'label']).size().reset_index(name='Count')
661
  grouped_entity_table.columns = ['Category', 'Entity', 'Count']
662
-
663
  st.markdown("## Entity Counts by Category and Entity")
664
  st.dataframe(grouped_entity_table.sort_values(by=['Category', 'Count'], ascending=[True, False]), use_container_width=True)
665
- with st.expander("See Glossary of tags"):
666
- st.write('''
667
- - **start**: ['index of the start of the corresponding entity']
668
- - **end**: ['index of the end of the corresponding entity']
669
- - **text**: ['entity extracted from your text data']
670
- - **label**: ['label (tag) assigned to a given extracted entity']
671
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
672
- ''')
673
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
675
  with tab2:
676
  st.markdown("## Visualizations")
677
-
678
  # 3a. Treemap (As requested in Tab 2)
679
  fig_treemap = px.treemap(
680
  df,
@@ -687,12 +644,11 @@ if st.session_state.show_results and not st.session_state.results_df.empty:
687
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
688
  st.markdown("### Entity Distribution Treemap")
689
  st.plotly_chart(fig_treemap, use_container_width=True)
690
-
691
- st.markdown("---")
692
 
 
693
  # 3b. Pie Chart and Category Bar Chart side-by-side
694
  col_pie, col_bar_cat = st.columns(2)
695
-
696
  # Pie Chart
697
  grouped_counts = df['category'].value_counts().reset_index()
698
  grouped_counts.columns = ['Category', 'Count']
@@ -703,71 +659,48 @@ if st.session_state.show_results and not st.session_state.results_df.empty:
703
  with col_pie:
704
  st.markdown("### Distribution of Entities by Category")
705
  st.plotly_chart(fig_pie, use_container_width=True)
706
-
707
  # Category Bar Chart
708
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
709
- color='Category', title='Total Entities per Category',
710
- color_discrete_sequence=px.colors.qualitative.Pastel)
711
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10))
712
  with col_bar_cat:
713
  st.markdown("### Total Entities per Category")
714
  st.plotly_chart(fig_bar_category, use_container_width=True)
715
-
716
- st.markdown("---")
717
 
 
718
  # 3c. Most Frequent Entities Bar Chart
719
  word_counts = df['text'].value_counts().reset_index()
720
  word_counts.columns = ['Entity', 'Count']
721
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
722
-
723
  st.markdown("### Top 10 Most Frequent Entities")
724
  if not repeating_entities.empty:
725
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
726
- color='Entity', title='Top 10 Most Frequent Entities',
727
- color_discrete_sequence=px.colors.sequential.Plasma)
728
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
729
  st.plotly_chart(fig_bar_freq, use_container_width=True)
730
  else:
731
  st.info("No entities appear more than once in the text for visualization.")
732
-
733
- st.markdown("---")
734
 
 
735
  # 3d. Network Graph
736
  st.markdown("### Entity Relationship Map")
737
  network_fig = generate_network_graph(df, text_input)
738
  st.plotly_chart(network_fig, use_container_width=True)
739
-
740
  st.markdown("---")
741
-
742
  # 4. Topic Modeling
743
  st.markdown("## Topic Modeling")
744
-
745
  if df_topic_data is not None and not df_topic_data.empty:
746
  st.markdown("### Bubble size = word weight")
747
  bubble_figure = create_topic_word_bubbles(df_topic_data)
748
  st.plotly_chart(bubble_figure, use_container_width=True)
749
-
750
  st.markdown("### Top Words by Topic")
751
- # Simple table display for topic data
752
- st.dataframe(df_topic_data, use_container_width=True)
753
  else:
754
- st.info("Topic Modeling requires more unique input (at least two unique entities) to be performed.")
755
-
756
- elif st.session_state.show_results and st.session_state.results_df.empty:
757
- st.warning("No entities were extracted from the provided text.")
758
-
759
-
760
- st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
761
- code = '''
762
- <iframe
763
- src="https://aiecosystem-dataharvest.hf.space"
764
- frameborder="0"
765
- width="850"
766
- height="450"
767
- ></iframe>
768
- '''
769
- st.code(code, language="html")
770
-
771
-
772
-
773
-
 
41
  "person": "#10b981",
42
  "country": "#3b82f6",
43
  "city": "#4ade80",
 
44
  "organization": "#f59e0b",
45
  "date": "#8b5cf6",
46
  "time": "#ec4899",
47
  "cardinal": "#06b6d4",
48
  "money": "#f43f5e",
49
  "position": "#a855f7",
 
50
  }
51
 
52
  # --- Label Definitions and Category Mapping (Used by the App) ---
53
  labels = list(entity_color_map.keys())
 
 
 
 
54
  labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
55
  category_mapping = {
56
  "People": ["person", "organization", "position"],
57
  "Locations": ["country", "city"],
58
  "Time": ["date", "time"],
59
+ "Numbers": ["money", "cardinal"]}
 
 
 
 
 
 
 
 
60
 
61
+ # CORRECTION 1: Reverse category mapping definition moved here for app-wide access
62
+ reverse_category_mapping = {label: category
63
+ for category, label_list in category_mapping.items() for label in label_list}
64
 
65
  # --- Utility Functions for Analysis and Plotly ---
66
  def extract_label(node_name):
 
76
  """Generates HTML to display text with entities highlighted and colored."""
77
  if df_entities.empty:
78
  return text
 
79
  # Sort entities by start index descending to insert highlights without affecting subsequent indices
80
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
81
  highlighted_text = text
 
82
  for entity in entities:
83
  start = entity['start']
84
  end = entity['end']
85
  label = entity['label']
86
  entity_text = entity['text']
87
  color = entity_color_map.get(label, '#000000')
 
88
  # Create a span with background color and tooltip
89
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
90
  # Replace the original text segment with the highlighted HTML
91
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
 
92
  # Use a div to mimic the Streamlit input box style for the report
93
  return f'<div style="border: 1px solid #CCCCCC; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
94
 
 
100
  documents = df_entities['text'].unique().tolist()
101
  if len(documents) < 2:
102
  return None
 
103
  N = min(num_top_words, len(documents))
104
  try:
105
  tfidf_vectorizer = TfidfVectorizer(
 
109
  )
110
  tfidf = tfidf_vectorizer.fit_transform(documents)
111
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
 
112
  lda = LatentDirichletAllocation(
113
  n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1
114
  )
 
134
  # Renaming columns to match the output of perform_topic_modeling
135
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
136
  df_topic_data['x_pos'] = df_topic_data.index # Use index for x-position in the app
 
137
  if df_topic_data.empty:
138
  return None
139
  fig = px.scatter(
 
164
  height=600,
165
  margin=dict(t=50, b=100, l=50, r=10),
166
  )
167
+ fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>',marker=dict(line=dict(width=1, color='DarkSlateGrey')))
 
168
  return fig
169
 
170
  def generate_network_graph(df, raw_text):
 
174
  """
175
  entity_counts = df['text'].value_counts().reset_index()
176
  entity_counts.columns = ['text', 'frequency']
 
177
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
178
  if unique_entities.shape[0] < 2:
179
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
180
 
181
  num_nodes = len(unique_entities)
182
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
 
183
  radius = 10
184
  unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
185
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
 
186
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
 
187
 
188
+ edges = set()
189
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
190
+
191
  for sentence in sentences:
192
  entities_in_sentence = []
193
  for entity_text in unique_entities['text'].unique():
194
  if entity_text.lower() in sentence.lower():
195
  entities_in_sentence.append(entity_text)
196
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
197
  for i in range(len(unique_entities_in_sentence)):
198
  for j in range(i + 1, len(unique_entities_in_sentence)):
199
  node1 = unique_entities_in_sentence[i]
 
203
 
204
  edge_x = []
205
  edge_y = []
 
206
  for edge in edges:
207
  n1, n2 = edge
208
  if n1 in pos_map and n2 in pos_map:
 
254
  seen_labels.add(label)
255
  color = entity_color_map.get(label, '#cccccc')
256
  legend_traces.append(go.Scatter(
257
+ x=[None], y=[None], mode='markers', marker=dict(size=10, color=color),name=f"{label.capitalize()}", showlegend=True
 
258
  ))
259
  for trace in legend_traces:
260
  fig.add_trace(trace)
 
270
  margin=dict(t=50, b=10, l=10, r=10),
271
  height=600
272
  )
 
273
  return fig
274
 
 
275
  # --- NEW CSV GENERATION FUNCTION ---
276
  def generate_entity_csv(df):
277
  """
 
287
  # -----------------------------------
288
 
289
  # --- Existing App Functionality (HTML) ---
 
 
 
 
290
  def generate_html_report(df, text_input, elapsed_time, df_topic_data):
291
  """
292
  Generates a full HTML report containing all analysis results and
293
  visualizations. (Simplified HTML generation for brevity in code)
294
  """
 
 
295
  # 1. Generate Visualizations (Plotly HTML)
 
296
  # 1a. Treemap
297
  fig_treemap = px.treemap(
298
  df,
 
322
  word_counts.columns = ['Entity', 'Count']
323
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
324
  bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
 
325
  if not repeating_entities.empty:
326
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Plasma)
327
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
 
356
  classes='table table-striped',
357
  index=False
358
  )
359
+
360
  # 4. Construct the Final HTML
361
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
362
  <meta charset="UTF-8">
 
404
  """
405
  return html_content
406
 
 
407
  # --- Page Configuration and Styling (No Sidebar, Removed Pink) ---
408
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
409
  st.markdown(
 
439
  </style>
440
  """,
441
  unsafe_allow_html=True)
 
 
 
 
 
442
 
443
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
444
+ st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
445
 
446
+ # CORRECTION 2: Removed duplicated expander. The following is the second, correct one.
447
  expander = st.expander("**Important notes**")
448
+ expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.**How to Use:** Type or paste your text (max. 1000 words) into the text area below, press Ctrl + Enter, and then click the 'Results' button.**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
 
 
 
 
 
 
 
 
 
449
 
450
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
451
 
 
456
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
457
 
458
  # --- Model Loading ---
459
+ @st.cache_resourced
460
  def load_ner_model():
461
  """Loads the GLiNER model and caches it."""
462
  try:
 
481
  "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
482
  "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
483
  "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
484
+ "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
 
485
  # -----------------------------------
486
+
487
  # --- Session State Initialization (CRITICAL FIX) ---
488
  if 'show_results' not in st.session_state:
489
  st.session_state.show_results = False
 
515
  height=250,
516
  key='my_text_area',
517
  value=st.session_state.my_text_area)
 
518
  word_count = len(text.split())
519
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
520
  st.button("Clear text", on_click=clear_text)
 
532
  if text != st.session_state.last_text:
533
  st.session_state.last_text = text
534
  start_time = time.time()
 
535
  # --- Model Prediction & Dataframe Creation ---
536
  entities = model.predict_entities(text, labels)
537
  df = pd.DataFrame(entities)
 
538
  if not df.empty:
539
  df['text'] = df['text'].apply(remove_trailing_punctuation)
540
  df['category'] = df['label'].map(reverse_category_mapping)
541
  st.session_state.results_df = df
 
542
  unique_entity_count = len(df['text'].unique())
543
  N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
 
544
  st.session_state.topic_results = perform_topic_modeling(
545
  df,
546
  num_topics=2,
547
  num_top_words=N_TOP_WORDS_TO_USE
548
  )
 
549
  if comet_initialized:
550
  experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
551
  experiment.log_parameter("input_text", text)
 
554
  else:
555
  st.session_state.results_df = pd.DataFrame()
556
  st.session_state.topic_results = None
 
557
  end_time = time.time()
558
  st.session_state.elapsed_time = end_time - start_time
559
+
560
+ st.session_state.show_results = True
561
 
562
  # --- Results Display ---
563
  if st.session_state.show_results and not st.session_state.results_df.empty:
564
  st.success(f"Processing complete in {st.session_state.elapsed_time:.2f} seconds! ๐ŸŽ‰")
565
+
566
  df = st.session_state.results_df
567
  text_input = st.session_state.last_text
568
  elapsed_time = st.session_state.elapsed_time
569
  df_topic_data = st.session_state.topic_results
570
+
571
  # --- Highlighted Text and Download Buttons (Above Tabs) ---
572
  st.subheader("1. Analyzed Text & Extracted Entities", divider="blue")
573
  st.markdown(
574
  highlight_entities(text_input, df),
575
  unsafe_allow_html=True
576
  )
 
577
  st.subheader("Downloads", divider="blue")
578
  col1, col2, col3 = st.columns([1, 1, 3])
579
+
580
  # 1. Download CSV
581
  csv_buffer = generate_entity_csv(df)
582
  col1.download_button(
 
585
  file_name="ner_entities.csv",
586
  mime="text/csv"
587
  )
 
588
  # 2. Download HTML Report
589
  html_content = generate_html_report(df, text_input, elapsed_time, df_topic_data)
590
  col2.download_button(
 
593
  file_name="ner_analysis_report.html",
594
  mime="text/html"
595
  )
596
+
597
  st.markdown("---")
598
+
599
+ # CORRECTION 1: Tabs Implementation
600
+ tab1, tab2 = st.tabs(["๐Ÿ“Š Entity Data (Table) & Glossary", "๐Ÿ“ˆ Visualizations & Topics"])
601
+
602
  with tab1:
603
  # Create the summary table with the requested column name changes
604
  grouped_entity_table = df.groupby(['category', 'label']).size().reset_index(name='Count')
605
  grouped_entity_table.columns = ['Category', 'Entity', 'Count']
606
+
607
  st.markdown("## Entity Counts by Category and Entity")
608
  st.dataframe(grouped_entity_table.sort_values(by=['Category', 'Count'], ascending=[True, False]), use_container_width=True)
 
 
 
 
 
 
 
 
609
 
610
+ st.markdown("---")
611
+ st.markdown("## Glossary of Tags and Category Mapping")
612
+
613
+ # Display Category Mapping (forward and reverse)
614
+ st.markdown("### Category to Entity Label Mapping (`category_mapping`)")
615
+ st.json(category_mapping)
616
+
617
+ # Display the requested reverse mapping below the table
618
+ st.markdown("### Entity Label to Category Mapping (Reverse Glossary) (`reverse_category_mapping`)")
619
+ st.json(reverse_category_mapping) # Display the reverse mapping which was moved to the top
620
+
621
+ # Display general glossary
622
+ st.markdown("### General Glossary for Extracted Entities")
623
+ st.write("""
624
+ - **start**: Index of the start of the corresponding entity.
625
+ - **end**: Index of the end of the corresponding entity.
626
+ - **text**: Entity extracted from your text data.
627
+ - **label**: The entity tag assigned to the extracted entity.
628
+ - **category**: The broad category (e.g., 'People') derived from the 'label'.
629
+ - **score**: Accuracy score; how accurately a tag has been assigned to a given entity.
630
+ """)
631
 
632
  with tab2:
633
  st.markdown("## Visualizations")
634
+
635
  # 3a. Treemap (As requested in Tab 2)
636
  fig_treemap = px.treemap(
637
  df,
 
644
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
645
  st.markdown("### Entity Distribution Treemap")
646
  st.plotly_chart(fig_treemap, use_container_width=True)
 
 
647
 
648
+ st.markdown("---")
649
  # 3b. Pie Chart and Category Bar Chart side-by-side
650
  col_pie, col_bar_cat = st.columns(2)
651
+
652
  # Pie Chart
653
  grouped_counts = df['category'].value_counts().reset_index()
654
  grouped_counts.columns = ['Category', 'Count']
 
659
  with col_pie:
660
  st.markdown("### Distribution of Entities by Category")
661
  st.plotly_chart(fig_pie, use_container_width=True)
 
662
  # Category Bar Chart
663
  fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
664
+ color='Category', title='Total Entities per Category',
665
+ color_discrete_sequence=px.colors.qualitative.Pastel)
666
  fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=10))
667
  with col_bar_cat:
668
  st.markdown("### Total Entities per Category")
669
  st.plotly_chart(fig_bar_category, use_container_width=True)
 
 
670
 
671
+ st.markdown("---")
672
  # 3c. Most Frequent Entities Bar Chart
673
  word_counts = df['text'].value_counts().reset_index()
674
  word_counts.columns = ['Entity', 'Count']
675
  repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
 
676
  st.markdown("### Top 10 Most Frequent Entities")
677
  if not repeating_entities.empty:
678
  fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
679
+ color='Entity', title='Top 10 Most Frequent Entities',
680
+ color_discrete_sequence=px.colors.sequential.Plasma)
681
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
682
  st.plotly_chart(fig_bar_freq, use_container_width=True)
683
  else:
684
  st.info("No entities appear more than once in the text for visualization.")
 
 
685
 
686
+ st.markdown("---")
687
  # 3d. Network Graph
688
  st.markdown("### Entity Relationship Map")
689
  network_fig = generate_network_graph(df, text_input)
690
  st.plotly_chart(network_fig, use_container_width=True)
691
+
692
  st.markdown("---")
693
+
694
  # 4. Topic Modeling
695
  st.markdown("## Topic Modeling")
696
+
697
  if df_topic_data is not None and not df_topic_data.empty:
698
  st.markdown("### Bubble size = word weight")
699
  bubble_figure = create_topic_word_bubbles(df_topic_data)
700
  st.plotly_chart(bubble_figure, use_container_width=True)
701
+
702
  st.markdown("### Top Words by Topic")
703
+ # Simple table display of topic words
704
+ st.dataframe(df_topic_data.rename(columns={'Topic_ID': 'Topic ID', 'Word': 'Top Word', 'Weight': 'Weight'}), use_container_width=True, hide_index=True)
705
  else:
706
+ st.info("Topic Modeling requires text containing at least two unique entities.")