AIEcosystem committed on
Commit
0c28caf
·
verified ·
1 Parent(s): ce1b83d

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +149 -206
src/streamlit_app.py CHANGED
@@ -12,18 +12,14 @@ import re
12
  import string
13
  import json
14
  from itertools import cycle
15
- # --- PPTX Imports (Note: pptx must be installed via 'pip install python-pptx') ---
16
  from io import BytesIO
17
  import plotly.io as pio
18
- # ---------------------------
19
- # --- Stable Scikit-learn LDA Imports ---
20
  from sklearn.feature_extraction.text import TfidfVectorizer
21
  from sklearn.decomposition import LatentDirichletAllocation
22
- # ------------------------------
23
  from gliner import GLiNER
24
  from streamlit_extras.stylable_container import stylable_container
25
 
26
- # Using a try/except for comet_ml import
27
  try:
28
  from comet_ml import Experiment
29
  except ImportError:
@@ -33,10 +29,7 @@ except ImportError:
33
  def log_table(self, *args): pass
34
  def end(self): pass
35
 
36
- # --- Model Home Directory (Fix for deployment environments) ---
37
- os.environ['HF_HOME'] = '/tmp'
38
-
39
- # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
40
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
41
  FIXED_ENTITY_COLOR_MAP = {
42
  "person": "#10b981", # Green
@@ -59,7 +52,6 @@ FIXED_CATEGORY_MAPPING = {
59
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
60
 
61
  # --- Dynamic Color Generator for Custom Labels ---
62
- # Use Plotly's Alphabet set for a large pool of distinct colors
63
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
64
 
65
  def extract_label(node_name):
@@ -74,86 +66,88 @@ def remove_trailing_punctuation(text_string):
74
def get_dynamic_color_map(active_labels, fixed_map):
    """Build a label -> color mapping for the labels currently in use.

    When ``active_labels`` equals ``FIXED_LABELS`` the supplied ``fixed_map``
    is returned as-is. Otherwise every label gets its color from
    ``fixed_map`` when present there, and the next color drawn from the
    module-level ``COLOR_PALETTE`` iterator when not.
    """
    if active_labels == FIXED_LABELS:
        return fixed_map

    # One palette draw per label occurrence that has no fixed color.
    return {
        name: fixed_map[name] if name in fixed_map else next(COLOR_PALETTE)
        for name in active_labels
    }
89
 
90
def highlight_entities(text, df_entities, entity_color_map):
    """Return HTML showing ``text`` with each entity wrapped in a colored span.

    Args:
        text: The full document. ``start``/``end`` in ``df_entities`` are used
            as character offsets into this string.
        df_entities: DataFrame with at least ``start``, ``end`` and ``label``
            columns.
        entity_color_map: Mapping of label -> CSS color; labels missing from
            the map fall back to ``#000000``.

    Returns:
        ``text`` unchanged when ``df_entities`` is empty, otherwise a
        ``<div style=...>`` block containing the highlighted document.
    """
    # Local import so the block does not rely on a file-level import.
    from html import escape

    if df_entities.empty:
        return text

    # Walk the spans left to right; for spans sharing a start, the longest
    # one comes first so the outermost entity is the one that is kept.
    ordered = df_entities.sort_values(
        by=['start', 'end'], ascending=[True, False]
    ).to_dict('records')

    pieces = []
    cursor = 0
    text_length = len(text)

    for entity in ordered:
        start = max(0, int(entity['start']))
        end = min(text_length, int(entity['end']))
        # Skip empty spans and spans overlapping one already emitted:
        # splicing them in would land inside previously generated markup.
        if end <= start or start < cursor:
            continue

        label = entity['label']
        color = entity_color_map.get(label, '#000000')
        safe_label = escape(str(label))
        safe_entity_text = escape(text[start:end])

        # The document is user input rendered as raw HTML, so both the gaps
        # between entities and the entity text itself are escaped.
        pieces.append(escape(text[cursor:start]))
        pieces.append(
            f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{safe_label}">{safe_entity_text}</span>'
        )
        cursor = end

    pieces.append(escape(text[cursor:]))
    highlighted_text = "".join(pieces)

    return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
115
 
116
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """Run LDA topic modeling over the unique entity texts.

    Args:
        df_entities: DataFrame with a ``text`` column; its unique values are
            used as the documents.
        num_topics: Number of LDA components to fit.
        num_top_words: Upper bound on words reported per topic (further
            capped at the number of documents, as in the original code).

    Returns:
        A DataFrame with ``Topic_ID``, ``Word`` and ``Weight`` columns, or
        ``None`` when there are fewer than two documents, the vocabulary is
        smaller than ``num_topics``, or fitting fails.
    """
    # Local import so the block does not rely on a file-level import.
    import logging

    documents = df_entities['text'].unique().tolist()
    if len(documents) < 2:
        return None

    top_n = min(num_top_words, len(documents))

    try:
        # First attempt: strict document-frequency thresholds.
        try:
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
            tfidf = tfidf_vectorizer.fit_transform(documents)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
        except ValueError:
            # The strict thresholds can reject the corpus outright (e.g. no
            # term survives pruning). Treat that as "no features" so the
            # relaxed vectorizer below still gets its chance instead of the
            # whole function bailing out.
            tfidf_feature_names = []

        if len(tfidf_feature_names) < num_topics:
            # Second attempt: keep every term.
            tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
            tfidf = tfidf_vectorizer.fit_transform(documents)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
            if len(tfidf_feature_names) < num_topics:
                return None

        lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
        lda.fit(tfidf)

        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # Indices of the top_n largest weights, heaviest first.
            top_words_indices = topic.argsort()[:-top_n - 1:-1]
            for i in top_words_indices:
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': tfidf_feature_names[i],
                    'Weight': topic[i],
                })

        return pd.DataFrame(topic_data_list)

    except Exception:
        # Best-effort feature: callers treat None as "no topics available",
        # but record the failure instead of discarding it silently.
        logging.getLogger(__name__).warning("Topic modeling failed", exc_info=True)
        return None
150
 
151
  def create_topic_word_bubbles(df_topic_data):
152
  """Generates a Plotly Bubble Chart for top words across all topics."""
153
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
154
  df_topic_data['x_pos'] = df_topic_data.index
 
155
  if df_topic_data.empty:
156
  return None
 
157
  fig = px.scatter(
158
  df_topic_data,
159
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
@@ -183,8 +177,10 @@ def generate_network_graph(df, raw_text, entity_color_map):
183
  entity_counts = df['text'].value_counts().reset_index()
184
  entity_counts.columns = ['text', 'frequency']
185
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
 
186
  if unique_entities.shape[0] < 2:
187
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
 
188
  num_nodes = len(unique_entities)
189
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
190
  radius = 10
@@ -192,32 +188,36 @@ def generate_network_graph(df, raw_text, entity_color_map):
192
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
193
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
194
  edges = set()
195
- # Simple sentence tokenizer
196
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
 
197
  for sentence in sentences:
198
  entities_in_sentence = []
199
  for entity_text in unique_entities['text'].unique():
200
- # Note: This is an inexact but fast co-occurrence check
201
  if entity_text.lower() in sentence.lower():
202
  entities_in_sentence.append(entity_text)
 
203
  unique_entities_in_sentence = list(set(entities_in_sentence))
 
204
  for i in range(len(unique_entities_in_sentence)):
205
  for j in range(i + 1, len(unique_entities_in_sentence)):
206
  node1 = unique_entities_in_sentence[i]
207
  node2 = unique_entities_in_sentence[j]
208
  edge_tuple = tuple(sorted((node1, node2)))
209
  edges.add(edge_tuple)
 
210
  edge_x = []
211
  edge_y = []
 
212
  for edge in edges:
213
  n1, n2 = edge
214
  if n1 in pos_map and n2 in pos_map:
215
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
216
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
217
-
218
  fig = go.Figure()
219
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
220
  fig.add_trace(edge_trace)
 
221
  fig.add_trace(go.Scatter(
222
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
223
  marker=dict(
@@ -229,6 +229,7 @@ def generate_network_graph(df, raw_text, entity_color_map):
229
  customdata=unique_entities[['label', 'score', 'frequency']],
230
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
231
  ))
 
232
  legend_traces = []
233
  seen_labels = set()
234
  for index, row in unique_entities.iterrows():
@@ -237,8 +238,10 @@ def generate_network_graph(df, raw_text, entity_color_map):
237
  seen_labels.add(label)
238
  color = entity_color_map.get(label, '#cccccc')
239
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
 
240
  for trace in legend_traces:
241
  fig.add_trace(trace)
 
242
  fig.update_layout(
243
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
244
  showlegend=True, hovermode='closest',
@@ -257,17 +260,13 @@ def generate_entity_csv(df):
257
  csv_buffer.seek(0)
258
  return csv_buffer
259
 
260
- # -----------------------------------
261
- # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
262
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
263
  """
264
  Generates a full HTML report containing all analysis results and visualizations.
265
- Accepts report_title and branding_html for white-labeling.
266
  """
267
- # Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
268
- unique_categories = df['category'].unique()
269
-
270
  # 1. Generate Visualizations (Plotly HTML)
 
271
  # 1a. Treemap
272
  fig_treemap = px.treemap(
273
  df,
@@ -303,10 +302,11 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
303
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
304
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
305
 
306
- # 1e. Network Graph HTML - IMPORTANT: Pass color map
307
  network_fig = generate_network_graph(df, text_input, entity_color_map)
308
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
309
 
 
310
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
311
  if df_topic_data is not None and not df_topic_data.empty:
312
  bubble_figure = create_topic_word_bubbles(df_topic_data)
@@ -315,12 +315,12 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
315
  else:
316
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
317
  else:
318
- topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
319
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
320
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
321
  topic_charts_html += '</div>'
322
 
323
- # 2. Get Highlighted Text - IMPORTANT: Pass color map
324
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
325
 
326
  # 3. Entity Tables (Pandas to HTML)
@@ -329,7 +329,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
329
  index=False
330
  )
331
 
332
- # 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
333
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
334
  <meta charset="UTF-8">
335
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
@@ -384,62 +384,59 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_col
384
 
385
def chunk_text(text, max_chunk_size=1500):
    """Split ``text`` into consecutive chunks of at most ``max_chunk_size`` characters.

    The text is first cut at paragraph breaks (``\\n\\n``) and at whitespace
    following ``.``, ``!`` or ``?``; the separators are kept, so joining the
    returned chunks reproduces ``text`` exactly. Segments are then packed
    greedily into chunks. A single segment longer than ``max_chunk_size``
    (for example text without sentence punctuation) is cut into fixed-size
    slices so that it cannot produce an oversized chunk.

    Args:
        text: The document to split.
        max_chunk_size: Maximum chunk length in characters. Hard splitting of
            oversized segments is only applied when this is positive.

    Returns:
        A list of ``(chunk, offset)`` tuples, where ``offset`` is the index of
        the chunk's first character in ``text``.
    """
    # The capturing group makes re.split return the separators as well.
    segments = re.split(r'(\n\n|(?<=[.!?])\s+)', text)
    chunks = []
    current_chunk = ""
    current_offset = 0

    for segment in segments:
        if not segment:
            continue

        if max_chunk_size > 0 and len(segment) > max_chunk_size:
            pieces = [
                segment[i:i + max_chunk_size]
                for i in range(0, len(segment), max_chunk_size)
            ]
        else:
            pieces = [segment]

        for piece in pieces:
            if current_chunk and len(current_chunk) + len(piece) > max_chunk_size:
                # Close the current chunk; the next one starts right after it.
                chunks.append((current_chunk, current_offset))
                current_offset += len(current_chunk)
                current_chunk = piece
            else:
                current_chunk += piece

    if current_chunk:
        chunks.append((current_chunk, current_offset))

    return chunks
404
 
405
def process_chunked_text(text, labels, model):
    """Run entity prediction chunk by chunk and return entities with document-level offsets.

    Args:
        text: The full document.
        labels: Labels passed through to ``model.predict_entities``.
        model: Object exposing ``predict_entities(chunk, labels)`` that
            returns dicts with ``start`` and ``end`` keys relative to the
            chunk it was given.

    Returns:
        A flat list of the entity dicts from every chunk, with ``start`` and
        ``end`` shifted by the chunk's offset so they index into ``text``.
    """
    MAX_CHUNK_CHARS = 3500
    all_entities = []

    # The loop variable must not be named ``chunk_text``: binding that name
    # anywhere in this function makes it local, and the call to the
    # module-level chunk_text() would then raise UnboundLocalError.
    for chunk, chunk_offset in chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS):
        for entity in model.predict_entities(chunk, labels):
            entity['start'] += chunk_offset
            entity['end'] += chunk_offset
            all_entities.append(entity)

    return all_entities
421
 
422
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
423
 
424
- # --- Conditional Mobile Warning ---
425
  st.markdown(
426
  """
427
  <style>
428
  /* FIX: Aggressive theme override to ensure visibility */
429
  body {
430
- background-color: #f0f2f6 !important; /* Force a light background */
431
- color: #333333 !important; /* Force dark text */
432
  }
433
- /* Ensure main Streamlit container background is also light */
434
  [data-testid="stAppViewBlock"] {
435
  background-color: #ffffff !important;
436
  }
437
- /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
438
  @media (max-width: 600px) {
439
  #mobile-warning-container {
440
- display: block; /* Show the warning container */
441
- background-color: #ffcccc; /* Light red/pink background */
442
- color: #cc0000; /* Dark red text */
443
  padding: 10px;
444
  border-radius: 5px;
445
  text-align: center;
@@ -448,27 +445,23 @@ st.markdown(
448
  border: 1px solid #cc0000;
449
  }
450
  }
451
- /* Hide the content by default (for larger screens) */
452
  @media (min-width: 601px) {
453
  #mobile-warning-container {
454
- display: none; /* Hide the warning container on desktop */
455
  }
456
  }
457
- /* --- FIX: Tab Label Colors for Visibility --- */
458
  [data-testid="stConfigurableTabs"] button {
459
- color: #333333 !important; /* Dark gray for inactive tabs */
460
- background-color: #f0f0f0; /* Light gray background for inactive tabs */
461
  border: 1px solid #cccccc;
462
  }
463
- /* Target the ACTIVE tab label */
464
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
465
- color: #FFFFFF !important; /* White text for active tab */
466
- background-color: #007bff; /* Blue background for active tab */
467
- border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
468
  }
469
- /* Expander header color fix (since you overwrote it to white) */
470
  .streamlit-expanderHeader {
471
- color: #007bff; /* Blue text for Expander header */
472
  }
473
  </style>
474
  <div id="mobile-warning-container">
@@ -477,8 +470,7 @@ st.markdown(
477
  """,
478
  unsafe_allow_html=True)
479
 
480
- # --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
481
- st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue")
482
 
483
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
484
  with tab1:
@@ -502,28 +494,20 @@ with tab2:
502
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
503
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
504
  """)
505
- st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
506
-
507
- # --- Comet ML Setup (Placeholder/Conditional) ---
508
- COMET_API_KEY = os.environ.get("COMET_API_KEY")
509
- COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
510
- COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
511
- comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
512
 
513
  # --- Model Loading ---
514
@st.cache_resource
def load_ner_model(labels):
    """Load the GLiNER model, cached by Streamlit as a shared resource.

    ``labels`` is forwarded to ``GLiNER.from_pretrained`` as
    ``gen_constraints``. If loading fails, the error is printed, shown in the
    UI via ``st.error`` and the script run is halted with ``st.stop()``.
    """
    try:
        model = GLiNER.from_pretrained(
            "knowledgator/gliner-multitask-large-v0.5",
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as err:
        # Console output for debugging, then a visible message for the user.
        print(f"FATAL ERROR: Failed to load NER model: {err}")
        st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {err}")
        st.stop()
    else:
        return model
525
 
526
- # --- LONG DEFAULT TEXT (178 Words) ---
527
  DEFAULT_TEXT = (
528
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
529
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
@@ -541,7 +525,7 @@ DEFAULT_TEXT = (
541
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
542
 
543
  # -----------------------------------
544
- # --- Session State Initialization (CRITICAL FIX) ---
545
  if 'show_results' not in st.session_state: st.session_state.show_results = False
546
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
547
  if 'results_df' not in st.session_state: st.session_state.results_df = pd.DataFrame()
@@ -551,13 +535,11 @@ if 'my_text_area' not in st.session_state: st.session_state.my_text_area = DEFAU
551
  if 'custom_labels_input' not in st.session_state: st.session_state.custom_labels_input = ""
552
  if 'active_labels_list' not in st.session_state: st.session_state.active_labels_list = FIXED_LABELS
553
  if 'is_custom_mode' not in st.session_state: st.session_state.is_custom_mode = False
554
- # Initialize Topic Model settings in state, so they can be set even if not using the sidebar
555
  if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slider = 5
556
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
557
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
558
  if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
559
- if 'last_active_labels' not in st.session_state: st.session_state.last_active_labels = None # Added for results comparison
560
-
561
 
562
  def clear_text():
563
  """Clears the text area (sets it to an empty string) and hides results."""
@@ -569,7 +551,7 @@ def clear_text():
569
  st.session_state.topic_results = None
570
 
571
  # --- Text Input and Clear Button ---
572
- word_limit = 10000 # Updated to 10000
573
  text = st.text_area(
574
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
575
  height=250,
@@ -583,25 +565,22 @@ custom_labels_text = st.text_area(
583
  "**Optional:** Enter your own comma-separated entity labels here (e.g., `product, symptom, client_id`). Leave blank for default labels.",
584
  height=60,
585
  key='custom_labels_input',
586
- placeholder="e.g., product, symptom, client_id" # Show placeholder after the prompt
587
  )
588
 
589
- # Use columns to align the buttons neatly
590
  col_results, col_clear = st.columns([1, 1])
591
  with col_results:
592
  run_button = st.button("Results", key='run_results', use_container_width=True)
593
  with col_clear:
594
  st.button("Clear text", on_click=clear_text, use_container_width=True)
595
 
596
- # --- Results Trigger and Processing (Completed Logic with Chunking and Topic Vars) ---
597
  if run_button:
598
  # 1. Determine Active Labels and Mode
599
  custom_labels_raw = st.session_state.custom_labels_input
600
  if custom_labels_raw.strip():
601
- # Sanitize and parse custom labels
602
  custom_labels_list = [label.strip().lower() for label in custom_labels_raw.split(',') if label.strip()]
603
  if not custom_labels_list:
604
- # Fallback if user enters commas but no actual words
605
  st.session_state.active_labels_list = FIXED_LABELS
606
  st.session_state.is_custom_mode = False
607
  st.info("No valid custom labels found. Falling back to default fixed labels.")
@@ -613,8 +592,6 @@ if run_button:
613
  st.session_state.is_custom_mode = False
614
 
615
  active_labels = st.session_state.active_labels_list
616
-
617
- # Get current topic modeling settings (used for caching logic)
618
  current_num_topics = st.session_state.num_topics_slider
619
  current_num_top_words = st.session_state.num_top_words_slider
620
 
@@ -624,67 +601,70 @@ if run_button:
624
  active_labels != st.session_state.last_active_labels
625
  )
626
 
627
- if should_rerun_full_analysis and text.strip() and word_count <= word_limit:
628
-
629
- # 2. Rerunning Full Analysis
630
- CHUNKING_THRESHOLD = 500
631
- should_chunk = word_count > CHUNKING_THRESHOLD
632
- mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
633
- if should_chunk:
634
- mode_msg += " with **chunking** for large text"
635
-
636
- with st.spinner(f"Analyzing text with {mode_msg}..."):
637
- start_time = time.time()
638
-
639
- # 2a. Load Model (Model constraints are updated based on active labels)
640
- # NOTE: Load time is cached, so this is fast on subsequent runs.
641
- model = load_ner_model(active_labels)
642
-
643
- # 2b. Extract Entities (using chunking if necessary)
644
  if should_chunk:
645
- all_entities = process_chunked_text(text, active_labels, model)
646
- else:
647
- all_entities = model.predict_entities(text, active_labels)
648
-
649
- end_time = time.time()
650
- elapsed_time = end_time - start_time
651
-
652
- # 2c. Prepare DataFrame
653
- df = pd.DataFrame(all_entities)
654
-
655
- if not df.empty:
656
- # Add category mapping
657
- if st.session_state.is_custom_mode:
658
- df['category'] = 'User Defined Entities'
659
- else:
660
- df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
661
 
662
- # Clean up extracted text
663
- df['text'] = df['text'].apply(remove_trailing_punctuation)
664
 
665
- # 2d. Perform Topic Modeling on extracted entities
666
- df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
667
- else:
668
- df_topic_data = None
669
 
670
- # 5. Save Results to Session State
671
- st.session_state.results_df = df
672
- st.session_state.topic_results = df_topic_data
673
- st.session_state.elapsed_time = elapsed_time
674
- st.session_state.last_text = text
675
- st.session_state.show_results = True
676
- st.session_state.last_active_labels = active_labels
677
- st.session_state.last_num_topics = current_num_topics # Save topic settings
678
- st.session_state.last_num_top_words = current_num_top_words # Save topic settings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
  else:
680
  st.info("Results already calculated for the current text and settings.")
681
  st.session_state.show_results = True
 
 
 
 
 
 
682
 
683
- # --- Display Download Link and Results (Updated with White-Label inputs) ---
684
  if st.session_state.show_results:
685
  df = st.session_state.results_df
686
  df_topic_data = st.session_state.topic_results
687
- # Generate the color map based on the results DF labels
688
  current_labels_in_df = df['label'].unique().tolist()
689
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
690
 
@@ -692,6 +672,7 @@ if st.session_state.show_results:
692
  st.warning("No entities were found in the provided text with the current label set.")
693
  else:
694
  st.subheader("Analysis Results", divider="blue")
 
695
  # 1. Highlighted Text
696
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
697
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
@@ -700,7 +681,6 @@ if st.session_state.show_results:
700
  st.markdown("### 2. Detailed Entity Analysis")
701
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
702
 
703
- # Determine which categories to use for the tabs
704
  if st.session_state.is_custom_mode:
705
  unique_categories = ["User Defined Entities"]
706
  tabs_to_show = df['label'].unique().tolist()
@@ -708,67 +688,42 @@ if st.session_state.show_results:
708
  else:
709
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
710
 
711
- # --- Section 2a: Detailed Tables by Category/Label ---
712
  # --- Function to Apply Conditional Coloring to Scores ---
713
- def color_score_gradient(df):
714
- """
715
- Applies a color gradient to the 'score' column using Pandas Styler.
716
- High scores (closer to 1.0) will be darker/more saturated.
717
- """
718
- # Use 'YlGnBu' (Yellow-Green-Blue) gradient.
719
- # We apply the gradient only to the 'score' column subset.
720
- return df.style.background_gradient(
721
  cmap='YlGnBu',
722
  subset=['score']
723
  ).format(
724
- {'score': '{:.4f}'} # Re-apply the four decimal place format
725
  )
726
 
727
- # --- Your Main Tab Detail Logic ---
728
  with tab_category_details:
729
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
730
  if st.session_state.is_custom_mode:
731
- # In custom mode, group by the actual label since the category is just "User Defined Entities"
732
  tabs_list = df['label'].unique().tolist()
733
  tabs_category = st.tabs(tabs_list)
734
 
735
  for label, tab in zip(tabs_list, tabs_category):
736
- # Prepare the DataFrame for the current label
737
  df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
738
-
739
- # Apply the coloring function
740
  styled_df_label = color_score_gradient(df_label)
741
  with tab:
742
  st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
743
- st.dataframe(
744
- # Pass the STYLED DataFrame object to Streamlit
745
- styled_df_label,
746
- use_container_width=True,
747
- # NOTE: st.column_config for 'score' is removed because Pandas Styler handles formatting and coloring
748
- )
749
  else:
750
- # In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
751
  tabs_category = st.tabs(unique_categories)
752
 
753
  for category, tab in zip(unique_categories, tabs_category):
754
- # Prepare the DataFrame for the current category
755
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
756
-
757
- # Apply the coloring function
758
  styled_df_category = color_score_gradient(df_category)
759
  with tab:
760
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
761
  if not df_category.empty:
762
- st.dataframe(
763
- # Pass the STYLED DataFrame object to Streamlit
764
- styled_df_category,
765
- use_container_width=True,
766
- # NOTE: st.column_config for 'score' is removed
767
- )
768
  else:
769
  st.info(f"No entities of category **{category}** were found in the text.")
770
 
771
- # --- INSERTED GLOSSARY HERE ---
772
  with st.expander("See Glossary of tags"):
773
  st.write('''- **text**: ['entity extracted from your text data']
774
  - **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']
@@ -776,7 +731,6 @@ if st.session_state.show_results:
776
  - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
777
  - **start**: ['index of the start of the corresponding entity']
778
  - **end**: ['index of the end of the corresponding entity']''')
779
- # --- END GLOSSARY INSERTION ---
780
 
781
  # --- Section 2b: Treemap Visualization ---
782
  with tab_treemap_viz:
@@ -791,13 +745,12 @@ if st.session_state.show_results:
791
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
792
  st.plotly_chart(fig_treemap, use_container_width=True)
793
 
794
- # --- Section 3: Comparative Charts (COMPLETED) ---
795
  st.markdown("---")
796
  st.markdown("### 3. Comparative Charts")
797
  col1, col2, col3 = st.columns(3)
798
  grouped_counts = df['category'].value_counts().reset_index()
799
  grouped_counts.columns = ['Category', 'Count']
800
- # Determine color sequence for charts
801
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
802
 
803
  with col1: # Pie Chart
@@ -823,17 +776,17 @@ if st.session_state.show_results:
823
  else:
824
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
825
 
826
- # 4. Advanced Analysis (REVISED STRUCTURE)
827
  st.markdown("---")
828
  st.markdown("### 4. Advanced Analysis")
829
 
830
- # --- A. Network Graph Section (Alone) ---
831
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
832
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
833
 
834
- # --- B. Topic Modeling Section (Controls and Chart inside one block) ---
835
  st.markdown("---")
836
- with st.container(border=True): # Use a container to visually group the Topic Modeling section
837
  st.markdown("#### 💡 Topic Modeling (LDA) Configuration and Results")
838
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
839
 
@@ -859,13 +812,13 @@ if st.session_state.show_results:
859
  help="The number of top words to display per topic (5 to 20)."
860
  )
861
 
862
- # Function to trigger a recalculation of ONLY the topic model
863
  def rerun_topic_model():
864
  # Update session state with the new slider values
865
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
866
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
867
- # Recalculate topic modeling results
868
  if not st.session_state.results_df.empty:
 
869
  df_topic_data_new = perform_topic_modeling(
870
  df_entities=st.session_state.results_df,
871
  num_topics=st.session_state.num_topics_slider,
@@ -874,45 +827,44 @@ if st.session_state.show_results:
874
  st.session_state.topic_results = df_topic_data_new
875
  st.session_state.last_num_topics = st.session_state.num_topics_slider
876
  st.session_state.last_num_top_words = st.session_state.num_top_words_slider
877
- # st.success("Topic Model Re-Run Complete!") # Removed success message as it causes an extra flash
878
 
879
  with col_rerun_btn:
880
- st.markdown("<div style='height: 38px;'></div>", unsafe_allow_html=True) # Vertical spacing
881
- # Rerun the entire app to update the chart immediately
882
  st.button("Re-Run Topic Model", on_click=rerun_topic_model, use_container_width=True, type="primary")
883
 
884
- # Display the topic chart inside the same container
885
  st.markdown("---")
886
  st.markdown(f"""
887
  **Current LDA Parameters:**
888
- * Topics: **{st.session_state.last_num_topics}**
889
- * Top Words: **{st.session_state.last_num_top_words}**
890
  """)
891
- df_topic_data = st.session_state.topic_results # Get the potentially updated results
 
 
 
892
  if df_topic_data is not None and not df_topic_data.empty:
893
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
894
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
 
895
  else:
896
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
897
 
898
- # --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
899
  st.markdown("---")
900
  st.markdown("### 5. White-Label Report Configuration 🎨")
901
- # Set a dynamic default title based on the mode
902
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
903
  custom_report_title = st.text_input(
904
  "Type Your Report Title (for HTML Report), and then press Enter.",
905
  value=default_report_title
906
  )
907
- # UPDATED: Simplified input for the user
908
  custom_branding_text_input = st.text_area(
909
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
910
- value="Analysis powered by My Own Brand", # Removed the technical <p> tag
911
  key='custom_branding_input',
912
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
913
  )
914
 
915
- # 6. Downloads (Updated to pass custom variables)
916
  st.markdown("---")
917
  st.markdown("### 6. Downloads")
918
  col_csv, col_html = st.columns(2)
@@ -928,19 +880,17 @@ if st.session_state.show_results:
928
  use_container_width=True
929
  )
930
 
931
- # --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
932
- # We wrap the user's plain text in a styled HTML paragraph element
933
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
934
 
935
- # HTML Download (Passing custom white-label parameters)
936
  html_content = generate_html_report(
937
  df,
938
  st.session_state.last_text,
939
  st.session_state.elapsed_time,
940
  df_topic_data,
941
  entity_color_map,
942
- report_title=custom_report_title, # Pass custom title
943
- branding_html=branding_to_pass # Pass the now-wrapped HTML
944
  )
945
  html_bytes = html_content.encode('utf-8')
946
  with col_html:
@@ -951,11 +901,4 @@ if st.session_state.show_results:
951
  mime="text/html",
952
  use_container_width=True
953
  )
954
-
955
-
956
-
957
-
958
-
959
-
960
-
961
-
 
12
  import string
13
  import json
14
  from itertools import cycle
 
15
  from io import BytesIO
16
  import plotly.io as pio
 
 
17
  from sklearn.feature_extraction.text import TfidfVectorizer
18
  from sklearn.decomposition import LatentDirichletAllocation
 
19
  from gliner import GLiNER
20
  from streamlit_extras.stylable_container import stylable_container
21
 
22
+ # --- Comet ML Imports (Optional/Placeholder) ---
23
  try:
24
  from comet_ml import Experiment
25
  except ImportError:
 
29
  def log_table(self, *args): pass
30
  def end(self): pass
31
 
32
+ # --- Fixed Label Definitions and Mappings ---
 
 
 
33
  FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
34
  FIXED_ENTITY_COLOR_MAP = {
35
  "person": "#10b981", # Green
 
52
  REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
53
 
54
  # --- Dynamic Color Generator for Custom Labels ---
 
55
  COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
56
 
57
  def extract_label(node_name):
 
66
  def get_dynamic_color_map(active_labels, fixed_map):
67
  """Generates a color map, using fixed colors if available, otherwise dynamic colors."""
68
  color_map = {}
 
69
  if active_labels == FIXED_LABELS:
70
  return fixed_map
71
+
72
  for label in active_labels:
 
73
  if label in fixed_map:
74
  color_map[label] = fixed_map[label]
75
  else:
 
76
  color_map[label] = next(COLOR_PALETTE)
77
  return color_map
78
 
79
  def highlight_entities(text, df_entities, entity_color_map):
80
+ """Generates HTML to display text with entities highlighted and colored."""
 
 
 
81
  if df_entities.empty:
82
  return text
83
+
84
  entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
85
  highlighted_text = text
86
+
87
  for entity in entities:
 
88
  start = max(0, entity['start'])
89
  end = min(len(text), entity['end'])
 
 
90
  entity_text_from_full_doc = text[start:end]
91
  label = entity['label']
92
  color = entity_color_map.get(label, '#000000')
93
+
94
  highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text_from_full_doc}</span>'
 
95
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
96
+
97
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
98
 
99
  def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
100
  """Performs basic Topic Modeling using LDA."""
101
  documents = df_entities['text'].unique().tolist()
 
 
102
  if len(documents) < 2:
103
  return None
104
+
105
  N = min(num_top_words, len(documents))
106
+
107
  try:
108
+ # Step 1: Try aggressive filtering
109
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
110
  tfidf = tfidf_vectorizer.fit_transform(documents)
111
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
112
+
113
+ # Step 2: Fallback if not enough features
114
  if len(tfidf_feature_names) < num_topics:
115
  tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
116
  tfidf = tfidf_vectorizer.fit_transform(documents)
117
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
118
  if len(tfidf_feature_names) < num_topics:
119
  return None
120
+
121
  lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
122
  lda.fit(tfidf)
123
  topic_data_list = []
124
+
125
  for topic_idx, topic in enumerate(lda.components_):
126
  top_words_indices = topic.argsort()[:-N - 1:-1]
127
  top_words = [tfidf_feature_names[i] for i in top_words_indices]
128
  word_weights = [topic[i] for i in top_words_indices]
129
+
130
  for word, weight in zip(top_words, word_weights):
131
  topic_data_list.append({
132
  'Topic_ID': f'Topic #{topic_idx + 1}',
133
  'Word': word,
134
  'Weight': weight,
135
  })
136
+
137
  return pd.DataFrame(topic_data_list)
138
+
139
  except Exception as e:
140
+ # print(f"Topic Modeling Error: {e}")
141
  return None
142
 
143
  def create_topic_word_bubbles(df_topic_data):
144
  """Generates a Plotly Bubble Chart for top words across all topics."""
145
  df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
146
  df_topic_data['x_pos'] = df_topic_data.index
147
+
148
  if df_topic_data.empty:
149
  return None
150
+
151
  fig = px.scatter(
152
  df_topic_data,
153
  x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
 
177
  entity_counts = df['text'].value_counts().reset_index()
178
  entity_counts.columns = ['text', 'frequency']
179
  unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
180
+
181
  if unique_entities.shape[0] < 2:
182
  return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
183
+
184
  num_nodes = len(unique_entities)
185
  thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
186
  radius = 10
 
188
  unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
189
  pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
190
  edges = set()
 
191
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
192
+
193
  for sentence in sentences:
194
  entities_in_sentence = []
195
  for entity_text in unique_entities['text'].unique():
 
196
  if entity_text.lower() in sentence.lower():
197
  entities_in_sentence.append(entity_text)
198
+
199
  unique_entities_in_sentence = list(set(entities_in_sentence))
200
+
201
  for i in range(len(unique_entities_in_sentence)):
202
  for j in range(i + 1, len(unique_entities_in_sentence)):
203
  node1 = unique_entities_in_sentence[i]
204
  node2 = unique_entities_in_sentence[j]
205
  edge_tuple = tuple(sorted((node1, node2)))
206
  edges.add(edge_tuple)
207
+
208
  edge_x = []
209
  edge_y = []
210
+
211
  for edge in edges:
212
  n1, n2 = edge
213
  if n1 in pos_map and n2 in pos_map:
214
  edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
215
  edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
216
+
217
  fig = go.Figure()
218
  edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
219
  fig.add_trace(edge_trace)
220
+
221
  fig.add_trace(go.Scatter(
222
  x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
223
  marker=dict(
 
229
  customdata=unique_entities[['label', 'score', 'frequency']],
230
  hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
231
  ))
232
+
233
  legend_traces = []
234
  seen_labels = set()
235
  for index, row in unique_entities.iterrows():
 
238
  seen_labels.add(label)
239
  color = entity_color_map.get(label, '#cccccc')
240
  legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
241
+
242
  for trace in legend_traces:
243
  fig.add_trace(trace)
244
+
245
  fig.update_layout(
246
  title='Entity Co-occurrence Network (Edges = Same Sentence)',
247
  showlegend=True, hovermode='closest',
 
260
  csv_buffer.seek(0)
261
  return csv_buffer
262
 
263
+ # --- HTML REPORT GENERATION FUNCTION ---
 
264
  def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
265
  """
266
  Generates a full HTML report containing all analysis results and visualizations.
 
267
  """
 
 
 
268
  # 1. Generate Visualizations (Plotly HTML)
269
+
270
  # 1a. Treemap
271
  fig_treemap = px.treemap(
272
  df,
 
302
  fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
303
  bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
304
 
305
+ # 1e. Network Graph HTML
306
  network_fig = generate_network_graph(df, text_input, entity_color_map)
307
  network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
308
 
309
+ # 1f. Topic Modeling Bubble Chart
310
  topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
311
  if df_topic_data is not None and not df_topic_data.empty:
312
  bubble_figure = create_topic_word_bubbles(df_topic_data)
 
315
  else:
316
  topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
317
  else:
318
+ topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
319
  topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
320
  topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
321
  topic_charts_html += '</div>'
322
 
323
+ # 2. Get Highlighted Text
324
  highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
325
 
326
  # 3. Entity Tables (Pandas to HTML)
 
329
  index=False
330
  )
331
 
332
+ # 4. Construct the Final HTML
333
  html_content = f"""<!DOCTYPE html><html lang="en"><head>
334
  <meta charset="UTF-8">
335
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
384
 
385
  def chunk_text(text, max_chunk_size=1500):
386
  """Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
 
387
  segments = re.split(r'(\n\n|(?<=[.!?])\s+)', text)
388
  chunks = []
389
  current_chunk = ""
390
  current_offset = 0
391
+
392
  for segment in segments:
393
  if not segment: continue
394
  if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
 
395
  chunks.append((current_chunk, current_offset))
396
  current_offset += len(current_chunk)
397
  current_chunk = segment
398
  else:
399
  current_chunk += segment
400
+
401
  if current_chunk:
402
  chunks.append((current_chunk, current_offset))
403
+
404
  return chunks
405
 
406
  def process_chunked_text(text, labels, model):
407
  """Processes large text in chunks and aggregates/offsets the entities."""
 
 
408
  MAX_CHUNK_CHARS = 3500
409
  chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
410
  all_entities = []
411
+
412
  for chunk_text, chunk_offset in chunks:
 
413
  chunk_entities = model.predict_entities(chunk_text, labels)
 
414
  for entity in chunk_entities:
415
  entity['start'] += chunk_offset
416
  entity['end'] += chunk_offset
417
  all_entities.append(entity)
418
+
419
  return all_entities
420
 
421
  st.set_page_config(layout="wide", page_title="NER & Topic Report App")
422
 
423
+ # --- Conditional Mobile Warning CSS ---
424
  st.markdown(
425
  """
426
  <style>
427
  /* FIX: Aggressive theme override to ensure visibility */
428
  body {
429
+ background-color: #f0f2f6 !important;
430
+ color: #333333 !important;
431
  }
 
432
  [data-testid="stAppViewBlock"] {
433
  background-color: #ffffff !important;
434
  }
 
435
  @media (max-width: 600px) {
436
  #mobile-warning-container {
437
+ display: block;
438
+ background-color: #ffcccc;
439
+ color: #cc0000;
440
  padding: 10px;
441
  border-radius: 5px;
442
  text-align: center;
 
445
  border: 1px solid #cc0000;
446
  }
447
  }
 
448
  @media (min-width: 601px) {
449
  #mobile-warning-container {
450
+ display: none;
451
  }
452
  }
 
453
  [data-testid="stConfigurableTabs"] button {
454
+ color: #333333 !important;
455
+ background-color: #f0f0f0;
456
  border: 1px solid #cccccc;
457
  }
 
458
  [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
459
+ color: #FFFFFF !important;
460
+ background-color: #007bff;
461
+ border-bottom: 2px solid #007bff;
462
  }
 
463
  .streamlit-expanderHeader {
464
+ color: #007bff;
465
  }
466
  </style>
467
  <div id="mobile-warning-container">
 
470
  """,
471
  unsafe_allow_html=True)
472
 
473
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
 
474
 
475
  tab1, tab2 = st.tabs(["Embed", "Important Notes"])
476
  with tab1:
 
494
  **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
495
  **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
496
  """)
497
+ st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)")
 
 
 
 
 
 
498
 
499
  # --- Model Loading ---
500
  @st.cache_resource
501
  def load_ner_model(labels):
502
  """Loads the GLiNER model and caches it."""
503
  try:
 
504
  return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
505
  except Exception as e:
 
506
  print(f"FATAL ERROR: Failed to load NER model: {e}")
507
  st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
508
  st.stop()
509
 
510
+ # --- LONG DEFAULT TEXT ---
511
  DEFAULT_TEXT = (
512
  "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
513
  "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
 
525
  "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
526
 
527
  # -----------------------------------
528
+ # --- Session State Initialization ---
529
  if 'show_results' not in st.session_state: st.session_state.show_results = False
530
  if 'last_text' not in st.session_state: st.session_state.last_text = ""
531
  if 'results_df' not in st.session_state: st.session_state.results_df = pd.DataFrame()
 
535
  if 'custom_labels_input' not in st.session_state: st.session_state.custom_labels_input = ""
536
  if 'active_labels_list' not in st.session_state: st.session_state.active_labels_list = FIXED_LABELS
537
  if 'is_custom_mode' not in st.session_state: st.session_state.is_custom_mode = False
 
538
  if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slider = 5
539
  if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
540
  if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
541
  if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
542
+ if 'last_active_labels' not in st.session_state: st.session_state.last_active_labels = None
 
543
 
544
  def clear_text():
545
  """Clears the text area (sets it to an empty string) and hides results."""
 
551
  st.session_state.topic_results = None
552
 
553
  # --- Text Input and Clear Button ---
554
+ word_limit = 10000
555
  text = st.text_area(
556
  f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
557
  height=250,
 
565
  "**Optional:** Enter your own comma-separated entity labels here (e.g., `product, symptom, client_id`). Leave blank for default labels.",
566
  height=60,
567
  key='custom_labels_input',
568
+ placeholder="e.g., product, symptom, client_id"
569
  )
570
 
 
571
  col_results, col_clear = st.columns([1, 1])
572
  with col_results:
573
  run_button = st.button("Results", key='run_results', use_container_width=True)
574
  with col_clear:
575
  st.button("Clear text", on_click=clear_text, use_container_width=True)
576
 
577
+ # --- Results Trigger and Processing ---
578
  if run_button:
579
  # 1. Determine Active Labels and Mode
580
  custom_labels_raw = st.session_state.custom_labels_input
581
  if custom_labels_raw.strip():
 
582
  custom_labels_list = [label.strip().lower() for label in custom_labels_raw.split(',') if label.strip()]
583
  if not custom_labels_list:
 
584
  st.session_state.active_labels_list = FIXED_LABELS
585
  st.session_state.is_custom_mode = False
586
  st.info("No valid custom labels found. Falling back to default fixed labels.")
 
592
  st.session_state.is_custom_mode = False
593
 
594
  active_labels = st.session_state.active_labels_list
 
 
595
  current_num_topics = st.session_state.num_topics_slider
596
  current_num_top_words = st.session_state.num_top_words_slider
597
 
 
601
  active_labels != st.session_state.last_active_labels
602
  )
603
 
604
+ if text.strip() and word_count <= word_limit:
605
+ if should_rerun_full_analysis:
606
+ # 2. Rerunning Full Analysis
607
+ CHUNKING_THRESHOLD = 500
608
+ should_chunk = word_count > CHUNKING_THRESHOLD
609
+ mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
 
 
 
 
 
 
 
 
 
 
 
610
  if should_chunk:
611
+ mode_msg += " with **chunking** for large text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
 
613
+ with st.spinner(f"Analyzing text with {mode_msg}..."):
614
+ start_time = time.time()
615
 
616
+ # 2a. Load Model
617
+ model = load_ner_model(active_labels)
 
 
618
 
619
+ # 2b. Extract Entities
620
+ if should_chunk:
621
+ all_entities = process_chunked_text(text, active_labels, model)
622
+ else:
623
+ all_entities = model.predict_entities(text, active_labels)
624
+
625
+ end_time = time.time()
626
+ elapsed_time = end_time - start_time
627
+
628
+ # 2c. Prepare DataFrame
629
+ df = pd.DataFrame(all_entities)
630
+
631
+ if not df.empty:
632
+ if st.session_state.is_custom_mode:
633
+ df['category'] = 'User Defined Entities'
634
+ else:
635
+ df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')
636
+
637
+ df['text'] = df['text'].apply(remove_trailing_punctuation)
638
+
639
+ # 2d. Perform Topic Modeling on extracted entities
640
+ df_topic_data = perform_topic_modeling(df, num_topics=current_num_topics, num_top_words=current_num_top_words)
641
+ else:
642
+ df_topic_data = None
643
+
644
+ # 5. Save Results to Session State
645
+ st.session_state.results_df = df
646
+ st.session_state.topic_results = df_topic_data
647
+ st.session_state.elapsed_time = elapsed_time
648
+ st.session_state.last_text = text
649
+ st.session_state.show_results = True
650
+ st.session_state.last_active_labels = active_labels
651
+ st.session_state.last_num_topics = current_num_topics
652
+ st.session_state.last_num_top_words = current_num_top_words
653
  else:
654
  st.info("Results already calculated for the current text and settings.")
655
  st.session_state.show_results = True
656
+ elif word_count > word_limit:
657
+ st.error(f"Text too long! Please limit your input to {word_limit} words.")
658
+ st.session_state.show_results = False
659
+ else:
660
+ st.warning("Please enter some text to analyze.")
661
+ st.session_state.show_results = False
662
 
663
+ # --- Display Download Link and Results ---
664
  if st.session_state.show_results:
665
  df = st.session_state.results_df
666
  df_topic_data = st.session_state.topic_results
667
+
668
  current_labels_in_df = df['label'].unique().tolist()
669
  entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)
670
 
 
672
  st.warning("No entities were found in the provided text with the current label set.")
673
  else:
674
  st.subheader("Analysis Results", divider="blue")
675
+
676
  # 1. Highlighted Text
677
  st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
678
  st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
 
681
  st.markdown("### 2. Detailed Entity Analysis")
682
  tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
683
 
 
684
  if st.session_state.is_custom_mode:
685
  unique_categories = ["User Defined Entities"]
686
  tabs_to_show = df['label'].unique().tolist()
 
688
  else:
689
  unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
690
 
 
691
  # --- Function to Apply Conditional Coloring to Scores ---
692
+ def color_score_gradient(df_input):
693
+ """Applies a color gradient to the 'score' column using Pandas Styler."""
694
+ return df_input.style.background_gradient(
 
 
 
 
 
695
  cmap='YlGnBu',
696
  subset=['score']
697
  ).format(
698
+ {'score': '{:.4f}'}
699
  )
700
 
701
+ # --- Section 2a: Detailed Tables by Category/Label ---
702
  with tab_category_details:
703
  st.markdown("#### Detailed Entities Table (Grouped by Category)")
704
  if st.session_state.is_custom_mode:
 
705
  tabs_list = df['label'].unique().tolist()
706
  tabs_category = st.tabs(tabs_list)
707
 
708
  for label, tab in zip(tabs_list, tabs_category):
 
709
  df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
 
 
710
  styled_df_label = color_score_gradient(df_label)
711
  with tab:
712
  st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
713
+ st.dataframe(styled_df_label, use_container_width=True)
 
 
 
 
 
714
  else:
 
715
  tabs_category = st.tabs(unique_categories)
716
 
717
  for category, tab in zip(unique_categories, tabs_category):
 
718
  df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
 
 
719
  styled_df_category = color_score_gradient(df_category)
720
  with tab:
721
  st.markdown(f"##### {category} Entities ({len(df_category)} total)")
722
  if not df_category.empty:
723
+ st.dataframe(styled_df_category, use_container_width=True)
 
 
 
 
 
724
  else:
725
  st.info(f"No entities of category **{category}** were found in the text.")
726
 
 
727
  with st.expander("See Glossary of tags"):
728
  st.write('''- **text**: ['entity extracted from your text data']
729
  - **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']
 
731
  - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
732
  - **start**: ['index of the start of the corresponding entity']
733
  - **end**: ['index of the end of the corresponding entity']''')
 
734
 
735
  # --- Section 2b: Treemap Visualization ---
736
  with tab_treemap_viz:
 
745
  fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
746
  st.plotly_chart(fig_treemap, use_container_width=True)
747
 
748
+ # 3. Comparative Charts
749
  st.markdown("---")
750
  st.markdown("### 3. Comparative Charts")
751
  col1, col2, col3 = st.columns(3)
752
  grouped_counts = df['category'].value_counts().reset_index()
753
  grouped_counts.columns = ['Category', 'Count']
 
754
  chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
755
 
756
  with col1: # Pie Chart
 
776
  else:
777
  st.info("No entities were repeated enough for a Top 10 frequency chart.")
778
 
779
+ # 4. Advanced Analysis
780
  st.markdown("---")
781
  st.markdown("### 4. Advanced Analysis")
782
 
783
+ # --- A. Network Graph Section ---
784
  with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
785
  st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
786
 
787
+ # --- B. Topic Modeling Section ---
788
  st.markdown("---")
789
+ with st.container(border=True):
790
  st.markdown("#### 💡 Topic Modeling (LDA) Configuration and Results")
791
  st.markdown("Adjust the settings below and click **'Re-Run Topic Model'** to instantly update the visualization based on the extracted entities.")
792
 
 
812
  help="The number of top words to display per topic (5 to 20)."
813
  )
814
 
 
815
  def rerun_topic_model():
816
  # Update session state with the new slider values
817
  st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
818
  st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
819
+
820
  if not st.session_state.results_df.empty:
821
+ # Recalculate topic modeling results
822
  df_topic_data_new = perform_topic_modeling(
823
  df_entities=st.session_state.results_df,
824
  num_topics=st.session_state.num_topics_slider,
 
827
  st.session_state.topic_results = df_topic_data_new
828
  st.session_state.last_num_topics = st.session_state.num_topics_slider
829
  st.session_state.last_num_top_words = st.session_state.num_top_words_slider
 
830
 
831
  with col_rerun_btn:
832
+ st.markdown("<div style='height: 38px;'></div>", unsafe_allow_html=True)
 
833
  st.button("Re-Run Topic Model", on_click=rerun_topic_model, use_container_width=True, type="primary")
834
 
 
835
  st.markdown("---")
836
  st.markdown(f"""
837
  **Current LDA Parameters:**
838
+ * Topics: **{st.session_state.num_topics_slider}**
839
+ * Top Words: **{st.session_state.num_top_words_slider}**
840
  """)
841
+
842
+ df_topic_data = st.session_state.topic_results
843
+
844
+ # --- CRITICAL: This is the conditional block that must have correct indentation ---
845
  if df_topic_data is not None and not df_topic_data.empty:
846
  st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
847
  st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
848
+ # END CRITICAL BLOCK
849
  else:
850
  st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
851
 
852
+ # 5. White-Label Configuration
853
  st.markdown("---")
854
  st.markdown("### 5. White-Label Report Configuration 🎨")
 
855
  default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
856
  custom_report_title = st.text_input(
857
  "Type Your Report Title (for HTML Report), and then press Enter.",
858
  value=default_report_title
859
  )
 
860
  custom_branding_text_input = st.text_area(
861
  "Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
862
+ value="Analysis powered by My Own Brand",
863
  key='custom_branding_input',
864
  help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
865
  )
866
 
867
+ # 6. Downloads
868
  st.markdown("---")
869
  st.markdown("### 6. Downloads")
870
  col_csv, col_html = st.columns(2)
 
880
  use_container_width=True
881
  )
882
 
883
+ # HTML Download (Passing custom white-label parameters)
 
884
  branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
885
 
 
886
  html_content = generate_html_report(
887
  df,
888
  st.session_state.last_text,
889
  st.session_state.elapsed_time,
890
  df_topic_data,
891
  entity_color_map,
892
+ report_title=custom_report_title,
893
+ branding_html=branding_to_pass
894
  )
895
  html_bytes = html_content.encode('utf-8')
896
  with col_html:
 
901
  mime="text/html",
902
  use_container_width=True
903
  )
904
+