import os
# Set the Hugging Face cache directory before any HF-backed import (deployment fix)
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import io
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import string
import json
from itertools import cycle
# --- In-memory buffer and Plotly I/O for report export ---
from io import BytesIO
import plotly.io as pio
# ---------------------------
# --- Stable Scikit-learn LDA Imports ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# ------------------------------
from gliner import GLiNER
from streamlit_extras.stylable_container import stylable_container
# Using a try/except for comet_ml import
try:
    from comet_ml import Experiment
except ImportError:
    # No-op stand-in so the app still runs when comet_ml is not installed
    class Experiment:
        def __init__(self, **kwargs): pass
        def log_parameter(self, *args): pass
        def log_table(self, *args): pass
        def end(self): pass
# --- Fixed Label Definitions and Mappings (Used as Fallback) ---
FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
FIXED_ENTITY_COLOR_MAP = {
"person": "#10b981", # Green
"country": "#3b82f6", # Blue
"city": "#4ade80", # Light Green
"organization": "#f59e0b", # Orange
"date": "#8b5cf6", # Purple
"time": "#ec4899", # Pink
"cardinal": "#06b6d4", # Cyan
"money": "#f43f5e", # Red
"position": "#a855f7", # Violet
}
# --- Fixed Category Mapping ---
FIXED_CATEGORY_MAPPING = {
"People & Roles": ["person", "organization", "position"],
"Locations": ["country", "city"],
"Time & Dates": ["date", "time"],
"Numbers & Finance": ["money", "cardinal"]}
REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
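# Illustrative lookups implied by the mapping above:
#   REVERSE_FIXED_CATEGORY_MAPPING["city"]   -> "Locations"
#   REVERSE_FIXED_CATEGORY_MAPPING["person"] -> "People & Roles"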
# --- Dynamic Color Generator for Custom Labels ---
# Use Plotly's Alphabet set for a large pool of distinct colors
COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
def extract_label(node_name):
"""Extracts the label from a node string like 'Text (Label)'."""
match = re.search(r'\(([^)]+)\)$', node_name)
return match.group(1) if match else "Unknown"
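# e.g. extract_label("Paris (city)") -> "city"; extract_label("Paris") -> "Unknown"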
def remove_trailing_punctuation(text_string):
"""Removes trailing punctuation from a string."""
return text_string.rstrip(string.punctuation)
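# e.g. remove_trailing_punctuation("TechSolutions Inc.,") -> "TechSolutions Inc"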
def get_dynamic_color_map(active_labels, fixed_map):
"""Generates a color map, using fixed colors if available, otherwise dynamic colors."""
color_map = {}
# If using fixed labels, use the fixed map directly
if active_labels == FIXED_LABELS:
return fixed_map
# If using custom labels, generate colors
for label in active_labels:
# Prioritize fixed color if the custom label happens to match a fixed one
if label in fixed_map:
color_map[label] = fixed_map[label]
else:
# Generate a new color from the palette
color_map[label] = next(COLOR_PALETTE)
return color_map
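# Sketch of the expected behaviour (palette colors are illustrative, drawn from a cycle):
#   get_dynamic_color_map(FIXED_LABELS, FIXED_ENTITY_COLOR_MAP) -> FIXED_ENTITY_COLOR_MAP
#   get_dynamic_color_map(["product", "person"], FIXED_ENTITY_COLOR_MAP) keeps the fixed
#   green "#10b981" for "person" and assigns the next palette color to "product".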
def highlight_entities(text, df_entities, entity_color_map):
"""
Generates HTML to display text with entities highlighted and colored.
IMPORTANT: Assumes 'start' and 'end' are relative to the 'text' input.
"""
if df_entities.empty:
return text
# Sort entities by start index descending to insert highlights without affecting subsequent indices
entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
highlighted_text = text
for entity in entities:
# Ensure the entity indices are within the bounds of the full text
start = max(0, entity['start'])
end = min(len(text), entity['end'])
# Re-slice the entity text from the full document using its (offset-corrected) indices;
# the dataframe's 'text' value came from the chunk, not the original document span.
entity_text_from_full_doc = text[start:end]
label = entity['label']
color = entity_color_map.get(label, '#000000')
# Create a span with background color and tooltip
highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text_from_full_doc}</span>'
# Replace the original text segment with the highlighted HTML
highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
# Use a div to mimic the Streamlit input box style for the report
return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
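# Illustrative output for a single entity (hypothetical values): given the text
# "Emily went home" and an entity {'start': 0, 'end': 5, 'label': 'person'}, the
# substring "Emily" is wrapped as
#   <span style="background-color: #10b981; ..." title="person">Emily</span>
# inside the outer bordered <div>.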
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
"""Performs basic Topic Modeling using LDA."""
documents = df_entities['text'].unique().tolist()
# Topic modeling is usually more effective with full sentences/paragraphs,
# but here we use the extracted entity texts as per the original code's intent.
if len(documents) < 2:
return None
N = min(num_top_words, len(documents))
try:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
if len(tfidf_feature_names) < num_topics:
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3))
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
if len(tfidf_feature_names) < num_topics:
return None
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
lda.fit(tfidf)
topic_data_list = []
for topic_idx, topic in enumerate(lda.components_):
top_words_indices = topic.argsort()[:-N - 1:-1]
top_words = [tfidf_feature_names[i] for i in top_words_indices]
word_weights = [topic[i] for i in top_words_indices]
for word, weight in zip(top_words, word_weights):
topic_data_list.append({
'Topic_ID': f'Topic #{topic_idx + 1}',
'Word': word,
'Weight': weight,
})
return pd.DataFrame(topic_data_list)
except Exception:
# LDA can fail on degenerate input (e.g., too few terms); treat it as "no topics"
return None
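# Shape of the DataFrame returned on success (weights are illustrative, not real output):
#   Topic_ID   Word      Weight
#   Topic #1   platform  0.81
#   Topic #1   european  0.74
#   Topic #2   agency    0.69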
def create_topic_word_bubbles(df_topic_data):
"""Generates a Plotly Bubble Chart for top words across all topics."""
df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic','Word': 'word', 'Weight': 'weight'})
df_topic_data['x_pos'] = df_topic_data.index
if df_topic_data.empty:
return None
fig = px.scatter(
df_topic_data,
x='x_pos', y='weight', size='weight', color='topic', text='word', hover_name='word', size_max=40,
title='Topic Word Weights (Bubble Chart)',
color_discrete_sequence=px.colors.qualitative.Bold,
labels={'x_pos': 'Entity/Word Index', 'weight': 'Word Weight', 'topic': 'Topic ID'},
custom_data=['word', 'weight', 'topic']
)
fig.update_layout(
xaxis_title="Entity/Word", yaxis_title="Word Weight",
xaxis={'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
yaxis={'showgrid': True},
showlegend=True, height=600,
margin=dict(t=50, b=100, l=50, r=10),
plot_bgcolor='#f9f9f9', paper_bgcolor='#f9f9f9'
)
fig.update_traces(
textposition='middle center',
textfont=dict(color='white', size=10),
hovertemplate="<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<br>Topic: %{customdata[2]}<extra></extra>",
marker=dict(line=dict(width=1, color='DarkSlateGrey'))
)
return fig
def generate_network_graph(df, raw_text, entity_color_map):
"""Generates a network graph visualization (Node Plot) with edges based on entity co-occurrence in sentences."""
entity_counts = df['text'].value_counts().reset_index()
entity_counts.columns = ['text', 'frequency']
unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
if unique_entities.shape[0] < 2:
return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
num_nodes = len(unique_entities)
thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
radius = 10
unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
edges = set()
# Simple sentence tokenizer
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
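# Illustrative split (the lookbehinds keep abbreviations like "Dr." from ending a sentence):
#   "Dr. Smith met Anna. They left." -> ["Dr. Smith met Anna.", "They left."]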
for sentence in sentences:
entities_in_sentence = []
for entity_text in unique_entities['text'].unique():
# Note: This is an inexact but fast co-occurrence check
if entity_text.lower() in sentence.lower():
entities_in_sentence.append(entity_text)
unique_entities_in_sentence = list(set(entities_in_sentence))
for i in range(len(unique_entities_in_sentence)):
for j in range(i + 1, len(unique_entities_in_sentence)):
node1 = unique_entities_in_sentence[i]
node2 = unique_entities_in_sentence[j]
edge_tuple = tuple(sorted((node1, node2)))
edges.add(edge_tuple)
edge_x = []
edge_y = []
for edge in edges:
n1, n2 = edge
if n1 in pos_map and n2 in pos_map:
edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
fig = go.Figure()
edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
fig.add_trace(edge_trace)
fig.add_trace(go.Scatter(
x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
marker=dict(
size=unique_entities['frequency'] * 5 + 10,
color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
line_width=1, line_color='black', opacity=0.9
),
textfont=dict(size=10),
customdata=unique_entities[['label', 'score', 'frequency']],
hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
))
legend_traces = []
seen_labels = set()
for index, row in unique_entities.iterrows():
label = row['label']
if label not in seen_labels:
seen_labels.add(label)
color = entity_color_map.get(label, '#cccccc')
legend_traces.append(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))
for trace in legend_traces:
fig.add_trace(trace)
fig.update_layout(
title='Entity Co-occurrence Network (Edges = Same Sentence)',
showlegend=True, hovermode='closest',
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
plot_bgcolor='#f9f9f9', paper_bgcolor='#f9f9f9',
margin=dict(t=50, b=10, l=10, r=10), height=600
)
return fig
# --- CSV GENERATION FUNCTION ---
def generate_entity_csv(df):
"""Generates a CSV file of the extracted entities in an in-memory buffer."""
csv_buffer = BytesIO()
df_export = df[['text', 'label', 'category', 'score', 'start', 'end']]
csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
csv_buffer.seek(0)
return csv_buffer
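# Minimal usage sketch (filename is illustrative):
#   buf = generate_entity_csv(df)
#   st.download_button("Download CSV", data=buf, file_name="entities.csv", mime="text/csv")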
# -----------------------------------
# --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
"""
Generates a full HTML report containing all analysis results and visualizations.
Accepts report_title and branding_html for white-labeling.
"""
# Use the category values from the DataFrame to ensure the report matches the app's current mode (fixed or custom)
unique_categories = df['category'].unique()
# 1. Generate Visualizations (Plotly HTML)
# 1a. Treemap
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
title="Entity Distribution by Category and Label",
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
# 1b. Pie Chart
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=color_seq)
fig_pie.update_layout(margin=dict(t=50, b=10))
pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
# 1c. Bar Chart (Category Count)
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=color_seq)
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
# 1d. Bar Chart (Most Frequent Entities)
word_counts = df['text'].value_counts().reset_index()
word_counts.columns = ['Entity', 'Count']
repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
if not repeating_entities.empty:
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
# 1e. Network Graph HTML - IMPORTANT: Pass color map
network_fig = generate_network_graph(df, text_input, entity_color_map)
network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
# 1f. Topic Charts HTML
topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
if df_topic_data is not None and not df_topic_data.empty:
bubble_figure = create_topic_word_bubbles(df_topic_data)
if bubble_figure:
topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
else:
topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
else:
topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
topic_charts_html += '</div>'
# 2. Get Highlighted Text - IMPORTANT: Pass color map
highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")
# 3. Entity Tables (Pandas to HTML)
entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
classes='table table-striped',
index=False
)
# 4. Construct the Final HTML (UPDATED FOR WHITE-LABELING)
html_content = f"""<!DOCTYPE html><html lang="en"><head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{report_title}</title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
.container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
h3 {{ color: #555; margin-top: 20px; }}
.metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
.chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
table th {{ background-color: #f0f0f0; }}
.highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
</style>
</head>
<body>
<div class="container">
<h1>{report_title}</h1>
<div class="metadata">
{branding_html} <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
<p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
</div>
<h2>1. Analyzed Text & Extracted Entities</h2>
<h3>Original Text with Highlighted Entities</h3>
<div class="highlighted-text-container">
{highlighted_text_html}
</div>
<h2>2. Full Extracted Entities Table</h2>
{entity_table_html}
<h2>3. Data Visualizations</h2>
<h3>3.1 Entity Distribution Treemap</h3>
<div class="chart-box">{treemap_html}</div>
<h3>3.2 Comparative Charts (Pie and Category Count) - *Stacked Vertically*</h3>
<div class="chart-box">{pie_html}</div>
<div class="chart-box">{bar_category_html}</div>
<h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
<div class="chart-box">{network_html}</div>
<h3>3.4 Most Frequent Entities</h3>
<div class="chart-box">{bar_freq_html}</div>
<h2>4. Topic Modelling</h2>
{topic_charts_html}
</div>
</body>
</html>
"""
return html_content
# --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
def chunk_text(text, max_chunk_size=1500):
"""Splits text into chunks by sentence/paragraph, respecting a max size (by character count)."""
# Split by double newline (paragraph) or sentence-like separators
segments = re.split(r'(\n\n|(?<=[.!?])\s+)', text)
chunks = []
current_chunk = ""
current_offset = 0
for segment in segments:
if not segment: continue
if len(current_chunk) + len(segment) > max_chunk_size and current_chunk:
# Save the current chunk and its starting offset
chunks.append((current_chunk, current_offset))
current_offset += len(current_chunk)
current_chunk = segment
else:
current_chunk += segment
if current_chunk:
chunks.append((current_chunk, current_offset))
return chunks
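# Worked example of the (chunk, starting_offset) contract:
#   chunk_text("First sentence. Second sentence.", max_chunk_size=20)
#   -> [("First sentence. ", 0), ("Second sentence.", 16)]
# Each offset is the chunk's first character index in the original string, which is
# exactly what process_chunked_text() below uses to shift entity indices back.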
def process_chunked_text(text, labels, model):
"""Processes large text in chunks and aggregates/offsets the entities."""
# GLiNER's context window is roughly 1024-1500 tokens, so we chunk by a conservative
# character limit: ~3500 characters is about 500-600 words, comfortably under the model
# limit even though the app accepts up to 10,000 words overall.
MAX_CHUNK_CHARS = 3500
chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
all_entities = []
for chunk_str, chunk_offset in chunks:
# Predict entities on each small chunk (loop variable renamed so it does not shadow chunk_text())
chunk_entities = model.predict_entities(chunk_str, labels)
# Offset the start and end indices to match the original document
for entity in chunk_entities:
entity['start'] += chunk_offset
entity['end'] += chunk_offset
all_entities.append(entity)
return all_entities
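# e.g. an entity predicted as {'text': 'Paris', 'start': 12, 'end': 17} inside a chunk
# whose offset is 3500 ends up with start=3512, end=3517 in document coordinates.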
# -----------------------------------
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
# --- Conditional Mobile Warning ---
st.markdown(
"""
<style>
/* FIX: Aggressive theme override to ensure visibility */
body {
background-color: #f0f2f6 !important; /* Force a light background */
color: #333333 !important; /* Force dark text */
}
/* Ensure main Streamlit container background is also light */
[data-testid="stAppViewBlock"] {
background-color: #ffffff !important;
}
/* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
@media (max-width: 600px) {
#mobile-warning-container {
display: block; /* Show the warning container */
background-color: #ffcccc; /* Light red/pink background */
color: #cc0000; /* Dark red text */
padding: 10px;
border-radius: 5px;
text-align: center;
margin-bottom: 20px;
font-weight: bold;
border: 1px solid #cc0000;
}
}
/* Hide the content by default (for larger screens) */
@media (min-width: 601px) {
#mobile-warning-container {
display: none; /* Hide the warning container on desktop */
}
}
/* --- FIX: Tab Label Colors for Visibility --- */
[data-testid="stConfigurableTabs"] button {
color: #333333 !important; /* Dark gray for inactive tabs */
background-color: #f0f0f0; /* Light gray background for inactive tabs */
border: 1px solid #cccccc;
}
/* Target the ACTIVE tab label */
[data-testid="stConfigurableTabs"] button[aria-selected="true"] {
color: #FFFFFF !important; /* White text for active tab */
background-color: #007bff; /* Blue background for active tab */
border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
}
/* Expander header color fix (since you overwrote it to white) */
.streamlit-expanderHeader {
color: #007bff; /* Blue text for Expander header */
}
</style>
<div id="mobile-warning-container">
⚠️ **Tip for Mobile Users:** For the best viewing experience of the charts and tables, please switch your browser to **"Desktop Site"** view.
</div>
""",
unsafe_allow_html=True)
# --- Topic Modeling Settings (Moved to main body, but need to initialize key outside of 'if st.session_state.show_results:') ---
# st.sidebar.header("Topic Modeling Settings ๐ก") # Removed sidebar header
st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
# Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
tab1, tab2 = st.tabs(["Embed", "Important Notes"])
with tab1:
with st.expander("Embed"):
st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
code = '''
<iframe
src="https://aiecosystem-dataharvest.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
st.code(code, language="html")
with tab2:
expander = st.expander("**Important Notes**")
expander.markdown("""
**Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".

**Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.

**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.

**How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
""")
st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
# --- Comet ML Setup (Placeholder/Conditional) ---
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
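# Hedged sketch of how an experiment could be wired up when credentials are present
# (not currently active in this app; kwargs follow comet_ml's documented Experiment API):
#   if comet_initialized:
#       exp = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE,
#                        project_name=COMET_PROJECT_NAME)
#       exp.log_parameter("labels", FIXED_LABELS)
#       exp.end()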
# --- Model Loading ---
@st.cache_resource
def load_ner_model(labels):
"""Loads the GLiNER model and caches it."""
try:
# The model requires constraints (labels) to be passed during loading
return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
except Exception as e:
# Log the actual error to the console for debugging
print(f"FATAL ERROR: Failed to load NER model: {e}")
st.error(f"Failed to load NER model. This may be due to a dependency issue or resource limits: {e}")
st.stop()
# --- LONG DEFAULT SAMPLE TEXT ---
DEFAULT_TEXT = (
"In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
"TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
"leap forward for commercial space technology across the entire **European Union**. The agreement, finalized "
"on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
"software platform. This version of the **Astra** platform is critical for processing and managing the vast amounts of data being sent "
"back from the recent Mars rover mission. This project underscores the ESA's commitment to advancing "
"space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
"their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
"media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
"publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
"end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
"The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
"are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
"general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
# -----------------------------------
# --- Session State Initialization (CRITICAL FIX) ---
if 'show_results' not in st.session_state: st.session_state.show_results = False
if 'last_text' not in st.session_state: st.session_state.last_text = ""
if 'results_df' not in st.session_state: st.session_state.results_df = pd.DataFrame()
if 'elapsed_time' not in st.session_state: st.session_state.elapsed_time = 0.0
if 'topic_results' not in st.session_state: st.session_state.topic_results = None
if 'my_text_area' not in st.session_state: st.session_state.my_text_area = DEFAULT_TEXT
if 'custom_labels_input' not in st.session_state: st.session_state.custom_labels_input = ""
if 'active_labels_list' not in st.session_state: st.session_state.active_labels_list = FIXED_LABELS
if 'is_custom_mode' not in st.session_state: st.session_state.is_custom_mode = False
# Initialize Topic Model settings in state, so they can be set even if not using the sidebar
if 'num_topics_slider' not in st.session_state: st.session_state.num_topics_slider = 5
if 'num_top_words_slider' not in st.session_state: st.session_state.num_top_words_slider = 10
if 'last_num_topics' not in st.session_state: st.session_state.last_num_topics = None
if 'last_num_top_words' not in st.session_state: st.session_state.last_num_top_words = None
# --- Clear Button Function (MODIFIED) ---
def clear_text():
"""Clears the text area (sets it to an empty string) and hides results."""
st.session_state['my_text_area'] = ""
st.session_state.show_results = False
st.session_state.last_text = ""
st.session_state.results_df = pd.DataFrame()
st.session_state.elapsed_time = 0.0
st.session_state.topic_results = None
# --- Text Input and Clear Button ---
word_limit = 10000  # maximum words accepted by the app
text = st.text_area(
f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
height=250,
key='my_text_area',
)
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
# --- Custom Labels Input ---
custom_labels_text = st.text_area(
"**Optional:** Enter your own comma-separated entity labels here (e.g., `product, symptom, client_id`). Leave blank for default labels.",
height=60,
key='custom_labels_input',
placeholder="e.g., product, symptom, client_id" # Show placeholder after the prompt
)
# Use columns to align the buttons neatly
col_results, col_clear = st.columns([1, 1])
with col_results:
run_button = st.button("Results", key='run_results', use_container_width=True)
with col_clear:
st.button("Clear text", on_click=clear_text, use_container_width=True)
# --- Results Trigger and Processing (Completed Logic with Chunking and Topic Vars) ---
if run_button:
# 1. Determine Active Labels and Mode
custom_labels_raw = st.session_state.custom_labels_input
if custom_labels_raw.strip():
# Sanitize and parse custom labels
custom_labels_list = [label.strip().lower() for label in custom_labels_raw.split(',') if label.strip()]
if not custom_labels_list:
# Fallback if user enters commas but no actual words
st.session_state.active_labels_list = FIXED_LABELS
st.session_state.is_custom_mode = False
st.info("No valid custom labels found. Falling back to default fixed labels.")
else:
st.session_state.active_labels_list = custom_labels_list
st.session_state.is_custom_mode = True
else:
st.session_state.active_labels_list = FIXED_LABELS
st.session_state.is_custom_mode = False
active_labels = st.session_state.active_labels_list
if not text.strip():
st.warning("Please enter some text to extract entities.")
st.session_state.show_results = False
elif word_count > word_limit:
st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
st.session_state.show_results = False
else:
# Define a safe threshold for when to start chunking (e.g., above 500 words)
CHUNKING_THRESHOLD = 500
should_chunk = word_count > CHUNKING_THRESHOLD
mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
if should_chunk:
mode_msg += " with **chunking** for large text"
# --- Topic Modeling Input Retrieval (Using default or current state values) ---
# The actual sliders are only visible after results are shown, so here we use the state defaults
# or the last successfully run values to check for changes and run the model.
current_num_topics = st.session_state.num_topics_slider
current_num_top_words = st.session_state.num_top_words_slider
with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
# Re-run prediction only if text, active labels, OR topic parameters have changed
current_settings = (text, tuple(active_labels), current_num_topics, current_num_top_words)
# Add topic settings to last_settings check
last_settings = (
st.session_state.last_text,
tuple(st.session_state.get('last_active_labels', [])),
st.session_state.get('last_num_topics', None),
st.session_state.get('last_num_top_words', None)
)
if current_settings != last_settings:
start_time = time.time()
ner_model = load_ner_model(labels=active_labels)
# 2. Perform NER Extraction
if should_chunk:
all_entities_list = process_chunked_text(text, active_labels, ner_model)
else:
all_entities_list = ner_model.predict_entities(text, active_labels)
df = pd.DataFrame(all_entities_list)
if df.empty:
df_topic_data = None
else:
# 3. Add Category Mapping
df['category'] = df['label'].apply(
lambda l: REVERSE_FIXED_CATEGORY_MAPPING.get(l, "User Defined Entities")
)
# 4. Perform Topic Modeling (Passing the new parameters)
df_topic_data = perform_topic_modeling(
df_entities=df,
num_topics=current_num_topics, # NEW PARAMETER
num_top_words=current_num_top_words # NEW PARAMETER
)
end_time = time.time()
elapsed_time = end_time - start_time
# 5. Save Results to Session State
st.session_state.results_df = df
st.session_state.topic_results = df_topic_data
st.session_state.elapsed_time = elapsed_time
st.session_state.last_text = text
st.session_state.show_results = True
st.session_state.last_active_labels = active_labels
st.session_state.last_num_topics = current_num_topics # Save topic settings
st.session_state.last_num_top_words = current_num_top_words # Save topic settings
else:
st.info("Results already calculated for the current text and settings.")
st.session_state.show_results = True
# --- Display Download Link and Results (Updated with White-Label inputs) ---
if st.session_state.show_results:
df = st.session_state.results_df
# Topic data is recalculated by the dedicated 'Re-Run Topic Model' button below, so
# slider changes take effect without re-running the full NER pipeline via 'Results'.
# --- Topic Model Slider Re-Run Logic (New Block) ---
st.markdown("---")
st.markdown("### 4. Advanced Analysis")
st.markdown("💡 **Topic Modeling Settings:** Adjust these sliders and click **'Re-Run Topic Model'** to see instant changes.")
col_slider_topic, col_slider_words, col_rerun_btn = st.columns([1, 1, 0.5])
with col_slider_topic:
new_num_topics = st.slider(
"Number of Topics",
min_value=2,
max_value=10,
value=st.session_state.num_topics_slider,
step=1,
key='num_topics_slider_new',
help="The number of topics to discover (2 to 10)."
)
with col_slider_words:
new_num_top_words = st.slider(
"Number of Top Words",
min_value=5,
max_value=20,
value=st.session_state.num_top_words_slider,
step=1,
key='num_top_words_slider_new',
help="The number of top words to display per topic (5 to 20)."
)
# Function to trigger a recalculation of ONLY the topic model
def rerun_topic_model():
# Update session state with the new slider values
st.session_state.num_topics_slider = st.session_state.num_topics_slider_new
st.session_state.num_top_words_slider = st.session_state.num_top_words_slider_new
# Recalculate topic modeling results
if not st.session_state.results_df.empty:
df_topic_data_new = perform_topic_modeling(
df_entities=st.session_state.results_df,
num_topics=st.session_state.num_topics_slider,
num_top_words=st.session_state.num_top_words_slider
)
st.session_state.topic_results = df_topic_data_new
st.session_state.last_num_topics = st.session_state.num_topics_slider
st.session_state.last_num_top_words = st.session_state.num_top_words_slider
st.success("Topic Model Re-Run Complete!")
# Rerunning Streamlit will display the updated state immediately
with col_rerun_btn:
st.markdown("<div style='height: 38px;'></div>", unsafe_allow_html=True) # Vertical spacing
st.button("Re-Run Topic Model", on_click=rerun_topic_model, use_container_width=True, type="primary")
df_topic_data = st.session_state.topic_results
# --- End Topic Model Slider Re-Run Logic ---
entity_color_map = get_dynamic_color_map(df['label'].unique().tolist(), FIXED_ENTITY_COLOR_MAP)
if df.empty:
st.warning("No entities were found in the provided text with the current label set.")
else:
st.subheader("Analysis Results", divider="blue")
# 1. Highlighted Text
st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)
# 2. Detailed Entity Analysis Tabs
st.markdown("### 2. Detailed Entity Analysis")
tab_category_details, tab_treemap_viz = st.tabs(["📊 Entities Grouped by Category", "🗺️ Treemap Distribution"])
# Determine which categories to use for the tabs
if st.session_state.is_custom_mode:
unique_categories = ["User Defined Entities"]
tabs_to_show = df['label'].unique().tolist()
st.markdown(f"**Custom Labels Detected: {', '.join(tabs_to_show)}**")
else:
unique_categories = list(FIXED_CATEGORY_MAPPING.keys())
# --- Section 2a: Detailed Tables by Category/Label ---
with tab_category_details:
st.markdown("#### Detailed Entities Table (Grouped by Category)")
if st.session_state.is_custom_mode:
# In custom mode, group by the actual label since the category is just "User Defined Entities"
tabs_list = df['label'].unique().tolist()
tabs_category = st.tabs(tabs_list)
for label, tab in zip(tabs_list, tabs_category):
df_label = df[df['label'] == label][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
with tab:
st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
st.dataframe(
df_label,
use_container_width=True,
column_config={'score': st.column_config.NumberColumn(format="%.4f")}
)
else:
# In fixed mode, group by the category defined in FIXED_CATEGORY_MAPPING
tabs_category = st.tabs(unique_categories)
for category, tab in zip(unique_categories, tabs_category):
df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
with tab:
st.markdown(f"##### {category} Entities ({len(df_category)} total)")
if not df_category.empty:
st.dataframe(
df_category,
use_container_width=True,
column_config={'score': st.column_config.NumberColumn(format="%.4f")}
)
else:
st.info(f"No entities of category **{category}** were found in the text.")
# --- INSERTED GLOSSARY HERE ---
with st.expander("See Glossary of tags"):
st.write('''
- **text**: the entity extracted from your text data
- **label**: the label (tag) assigned to a given extracted entity (custom or fixed)
- **category**: the grouping category (e.g., "Locations" or "User Defined Entities")
- **score**: confidence score; how confidently a tag has been assigned to a given entity
- **start**: character index of the start of the corresponding entity
- **end**: character index of the end of the corresponding entity
''')
# --- END GLOSSARY INSERTION ---
# --- Section 2b: Treemap Visualization ---
with tab_treemap_viz:
st.markdown("#### Treemap: Entity Distribution")
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
st.plotly_chart(fig_treemap, use_container_width=True)
# --- Section 3: Comparative Charts (COMPLETED) ---
st.markdown("---")
st.markdown("### 3. Comparative Charts")
col1, col2, col3 = st.columns(3)
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
# Determine color sequence for charts
chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
with col1: # Pie Chart
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=chart_color_seq)
fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
st.plotly_chart(fig_pie, use_container_width=True)
with col2: # Bar Chart by Category
st.markdown("#### Entity Count by Category")
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=chart_color_seq)
fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
st.plotly_chart(fig_bar_category, use_container_width=True)
with col3: # Bar Chart for Most Frequent Entities
st.markdown("#### Top 10 Most Frequent Entities")
word_counts = df['text'].value_counts().reset_index()
word_counts.columns = ['Entity', 'Count']
repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
if not repeating_entities.empty:
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', title='Top 10 Most Frequent Entities', color='Entity', color_discrete_sequence=px.colors.sequential.Viridis)
fig_bar_freq.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
st.plotly_chart(fig_bar_freq, use_container_width=True)
else:
st.info("No entities were repeated enough for a Top 10 frequency chart.")
# 4. Network Graph and Topic Modeling (Modified to show controls and charts in columns)
col_network, col_topic = st.columns(2)
with col_network:
with st.expander("🌐 Entity Co-occurrence Network Graph", expanded=True):
st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)
with col_topic:
with st.expander("💡 Topic Modeling (LDA)", expanded=True):
# Display the current settings used for the topic modeling result
st.markdown(f"""
**Current LDA Parameters:**
* Topics: **{st.session_state.last_num_topics}**
* Top Words: **{st.session_state.last_num_top_words}**
""")
if df_topic_data is not None and not df_topic_data.empty:
st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
else:
st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")
# --- 5. White-Label Configuration (NEW SECTION FOR CUSTOM BRANDING) ---
st.markdown("---")
st.markdown("### 5. White-Label Report Configuration 🎨")
# Set a dynamic default title based on the mode
default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
custom_report_title = st.text_input(
"Type Your Report Title (for HTML Report), and then press Enter.",
value=default_report_title
)
# UPDATED: Simplified input for the user
custom_branding_text_input = st.text_area(
"Type Your Brand Name or Tagline (Appears below the title in the report), and then press Enter.",
value="Analysis powered by My Own Brand", # Removed the technical <p> tag
key='custom_branding_input',
help="Enter your brand name or a short tagline. This text will be automatically styled and included below the main title."
)
# 6. Downloads (Updated to pass custom variables)
st.markdown("---")
st.markdown("### 6. Downloads")
col_csv, col_html = st.columns(2)
# CSV Download
csv_buffer = generate_entity_csv(df)
with col_csv:
st.download_button(
label="⬇️ Download Entities as CSV",
data=csv_buffer,
file_name="ner_entities_report.csv",
mime="text/csv",
use_container_width=True
)
# --- NEW LOGIC: Wrap the simple text input into proper HTML for the report ---
# We wrap the user's plain text in a styled HTML paragraph element
branding_to_pass = f'<p style="font-size: 1.1em; font-weight: 500;">{custom_branding_text_input}</p>'
# HTML Download (Passing custom white-label parameters)
html_content = generate_html_report(
df,
st.session_state.last_text,
st.session_state.elapsed_time,
df_topic_data,
entity_color_map,
report_title=custom_report_title, # Pass custom title
branding_html=branding_to_pass # Pass the now-wrapped HTML
)
html_bytes = html_content.encode('utf-8')
with col_html:
st.download_button(
label="⬇️ Download Full HTML Report",
data=html_bytes,
file_name="ner_topic_full_report.html",
mime="text/html",
use_container_width=True
) |