Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +73 -64
src/streamlit_app.py
CHANGED
|
@@ -39,23 +39,23 @@ except ImportError:
|
|
| 39 |
# Set HF_HOME environment variable to a writable path
|
| 40 |
os.environ['HF_HOME'] = '/tmp'
|
| 41 |
|
| 42 |
-
# --- Color Map for Highlighting and Network Graph Nodes ---
|
| 43 |
entity_color_map = {
|
| 44 |
-
"person": "#
|
| 45 |
-
"username": "#
|
| 46 |
-
"hashtag": "#
|
| 47 |
-
"mention" : "#
|
| 48 |
-
"organization": "#
|
| 49 |
-
"community": "#
|
| 50 |
-
"position": "#
|
| 51 |
-
"location": "#
|
| 52 |
-
"event": "#
|
| 53 |
-
"product": "#
|
| 54 |
-
"platform": "#
|
| 55 |
-
"date": "#
|
| 56 |
-
"media_type": "#
|
| 57 |
-
"url": "#
|
| 58 |
-
"nationality_religion": "#
|
| 59 |
}
|
| 60 |
|
| 61 |
# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
|
|
@@ -96,6 +96,7 @@ def highlight_entities(text, df_entities):
|
|
| 96 |
end = entity['end']
|
| 97 |
label = entity['label']
|
| 98 |
entity_text = entity['text']
|
|
|
|
| 99 |
color = entity_color_map.get(label, '#000000')
|
| 100 |
|
| 101 |
# Create a span with background color and tooltip
|
|
@@ -103,8 +104,8 @@ def highlight_entities(text, df_entities):
|
|
| 103 |
# Replace the original text segment with the highlighted HTML
|
| 104 |
highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
|
| 105 |
|
| 106 |
-
# Use a div to mimic the Streamlit input box style for the report
|
| 107 |
-
return f'<div style="border: 1px solid #
|
| 108 |
|
| 109 |
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
|
| 110 |
"""
|
|
@@ -162,7 +163,7 @@ def create_topic_word_bubbles(df_topic_data):
|
|
| 162 |
hover_name='word',
|
| 163 |
size_max=80,
|
| 164 |
title='Topic Word Weights (Bubble Chart)',
|
| 165 |
-
color_discrete_sequence=px.colors.
|
| 166 |
labels={
|
| 167 |
'x_pos': 'Entity/Word Index',
|
| 168 |
'weight': 'Word Weight',
|
|
@@ -176,8 +177,8 @@ def create_topic_word_bubbles(df_topic_data):
|
|
| 176 |
xaxis={'tickangle': -45, 'showgrid': False},
|
| 177 |
yaxis={'showgrid': True},
|
| 178 |
showlegend=True,
|
| 179 |
-
plot_bgcolor='#
|
| 180 |
-
paper_bgcolor='#
|
| 181 |
height=600,
|
| 182 |
margin=dict(t=50, b=100, l=50, r=10),
|
| 183 |
)
|
|
@@ -253,7 +254,7 @@ def generate_network_graph(df, raw_text):
|
|
| 253 |
showlegend=False,
|
| 254 |
marker=dict(
|
| 255 |
size=unique_entities['frequency'] * 5 + 10,
|
| 256 |
-
color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']], #
|
| 257 |
line_width=1,
|
| 258 |
line_color='black',
|
| 259 |
opacity=0.9
|
|
@@ -325,13 +326,16 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 325 |
# Layout 5: Title and Content (often good for charts)
|
| 326 |
chart_layout = prs.slide_layouts[5]
|
| 327 |
|
| 328 |
-
# 1. Title Slide
|
| 329 |
title_slide_layout = prs.slide_layouts[0]
|
| 330 |
slide = prs.slides.add_slide(title_slide_layout)
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
# 2. Source Text Slide
|
| 337 |
slide = prs.slides.add_slide(chart_layout)
|
|
@@ -382,13 +386,6 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 382 |
cell.fill.solid()
|
| 383 |
# Optional: Add simple styling to header
|
| 384 |
|
| 385 |
-
# Fill in the data
|
| 386 |
-
for i in range(rows):
|
| 387 |
-
for j in range(cols):
|
| 388 |
-
cell = table.cell(i+1, j)
|
| 389 |
-
cell.text = str(grouped_entity_table.iloc[i, j])
|
| 390 |
-
# Optional: Style data cells
|
| 391 |
-
|
| 392 |
# 4. Treemap Slide (Visualization)
|
| 393 |
fig_treemap = px.treemap(
|
| 394 |
df,
|
|
@@ -396,7 +393,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 396 |
values='score',
|
| 397 |
color='category',
|
| 398 |
title="Entity Distribution by Category and Label",
|
| 399 |
-
color_discrete_sequence=px.colors.
|
| 400 |
)
|
| 401 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
| 402 |
treemap_image = fig_to_image_buffer(fig_treemap)
|
|
@@ -409,7 +406,9 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 409 |
# Placeholder if image conversion failed (e.g., Kaleido issue)
|
| 410 |
slide = prs.slides.add_slide(chart_layout)
|
| 411 |
slide.shapes.title.text = "Entity Distribution Treemap (Chart Failed)"
|
| 412 |
-
|
|
|
|
|
|
|
| 413 |
|
| 414 |
|
| 415 |
# 5. Entity Count Bar Chart Slide (Visualization)
|
|
@@ -421,7 +420,7 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 421 |
y='Count',
|
| 422 |
color='Category',
|
| 423 |
title='Total Entities per Category',
|
| 424 |
-
color_discrete_sequence=px.colors.
|
| 425 |
)
|
| 426 |
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
|
| 427 |
bar_category_image = fig_to_image_buffer(fig_bar_category)
|
|
@@ -433,7 +432,9 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 433 |
else:
|
| 434 |
slide = prs.slides.add_slide(chart_layout)
|
| 435 |
slide.shapes.title.text = "Total Entities per Category (Chart Failed)"
|
| 436 |
-
|
|
|
|
|
|
|
| 437 |
|
| 438 |
|
| 439 |
# 6. Topic Modeling Bubble Chart Slide
|
|
@@ -449,13 +450,17 @@ def generate_pptx_report(df, text_input, elapsed_time, df_topic_data, reverse_ca
|
|
| 449 |
else:
|
| 450 |
slide = prs.slides.add_slide(chart_layout)
|
| 451 |
slide.shapes.title.text = "Topic Word Weights (Chart Failed)"
|
| 452 |
-
|
|
|
|
|
|
|
| 453 |
|
| 454 |
else:
|
| 455 |
# Placeholder slide if topic modeling is not available
|
| 456 |
slide = prs.slides.add_slide(chart_layout)
|
| 457 |
slide.shapes.title.text = "Topic Modeling Results"
|
| 458 |
-
|
|
|
|
|
|
|
| 459 |
|
| 460 |
# Save the presentation to an in-memory buffer
|
| 461 |
pptx_buffer = BytesIO()
|
|
@@ -490,8 +495,8 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 490 |
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
|
| 491 |
values='score',
|
| 492 |
color='category',
|
| 493 |
-
|
| 494 |
-
color_discrete_sequence=px.colors.
|
| 495 |
)
|
| 496 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
| 497 |
treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
|
|
@@ -499,12 +504,12 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 499 |
# 1b. Pie Chart
|
| 500 |
grouped_counts = df['category'].value_counts().reset_index()
|
| 501 |
grouped_counts.columns = ['Category', 'Count']
|
| 502 |
-
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.
|
| 503 |
fig_pie.update_layout(margin=dict(t=50, b=10))
|
| 504 |
pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
|
| 505 |
|
| 506 |
# 1c. Bar Chart (Category Count)
|
| 507 |
-
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.
|
| 508 |
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
|
| 509 |
bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
|
| 510 |
|
|
@@ -515,7 +520,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 515 |
bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
|
| 516 |
|
| 517 |
if not repeating_entities.empty:
|
| 518 |
-
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.
|
| 519 |
fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
|
| 520 |
bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
|
| 521 |
|
|
@@ -524,7 +529,7 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 524 |
network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
|
| 525 |
|
| 526 |
# 1f. Topic Charts HTML
|
| 527 |
-
topic_charts_html = '<h3>
|
| 528 |
if df_topic_data is not None and not df_topic_data.empty:
|
| 529 |
bubble_figure = create_topic_word_bubbles(df_topic_data)
|
| 530 |
if bubble_figure:
|
|
@@ -532,12 +537,13 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 532 |
else:
|
| 533 |
topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
|
| 534 |
else:
|
| 535 |
-
topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #
|
| 536 |
topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
|
| 537 |
topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
|
| 538 |
topic_charts_html += '</div>'
|
| 539 |
|
| 540 |
# 2. Get Highlighted Text
|
|
|
|
| 541 |
highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
|
| 542 |
|
| 543 |
# 3. Entity Tables (Pandas to HTML)
|
|
@@ -547,23 +553,24 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 547 |
)
|
| 548 |
|
| 549 |
# 4. Construct the Final HTML
|
|
|
|
| 550 |
html_content = f"""<!DOCTYPE html><html lang="en"><head>
|
| 551 |
<meta charset="UTF-8">
|
| 552 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 553 |
<title>Entity and Topic Analysis Report</title>
|
| 554 |
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
|
| 555 |
<style>
|
| 556 |
-
body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #
|
| 557 |
.container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
|
| 558 |
-
h1 {{ color: #
|
| 559 |
-
h2 {{ color: #
|
| 560 |
h3 {{ color: #555; margin-top: 20px; }}
|
| 561 |
-
.metadata {{ background-color: #
|
| 562 |
.chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
|
| 563 |
table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
|
| 564 |
table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
|
| 565 |
table th {{ background-color: #f0f0f0; }}
|
| 566 |
-
.highlighted-text {{ border: 1px solid #
|
| 567 |
</style></head><body>
|
| 568 |
<div class="container">
|
| 569 |
<h1>Entity and Topic Analysis Report</h1>
|
|
@@ -581,13 +588,13 @@ def generate_html_report(df, text_input, elapsed_time, df_topic_data):
|
|
| 581 |
<h2>3. Data Visualizations</h2>
|
| 582 |
<h3>3.1 Entity Distribution Treemap</h3>
|
| 583 |
<div class="chart-box">{treemap_html}</div>
|
| 584 |
-
<h3>3.2 Comparative Charts
|
| 585 |
<div class="chart-box">{pie_html}</div>
|
| 586 |
<div class="chart-box">{bar_category_html}</div>
|
| 587 |
<div class="chart-box">{bar_freq_html}</div>
|
| 588 |
-
<h3>3.3 Entity
|
| 589 |
<div class="chart-box">{network_html}</div>
|
| 590 |
-
<h2>4. Topic
|
| 591 |
{topic_charts_html}
|
| 592 |
</div></body></html>
|
| 593 |
"""
|
|
@@ -601,21 +608,21 @@ st.markdown(
|
|
| 601 |
<style>
|
| 602 |
/* Overall app container - NO SIDEBAR */
|
| 603 |
.main {
|
| 604 |
-
background-color: #
|
| 605 |
color: #333333; /* Dark grey text for contrast */
|
| 606 |
}
|
| 607 |
.stApp {
|
| 608 |
-
background-color: #
|
| 609 |
}
|
| 610 |
/* Text Area background and text color (input fields) */
|
| 611 |
.stTextArea textarea {
|
| 612 |
-
background-color: #
|
| 613 |
color: #000000; /* Black text for input */
|
| 614 |
-
border: 1px solid #
|
| 615 |
}
|
| 616 |
/* Button styling */
|
| 617 |
.stButton > button {
|
| 618 |
-
background-color: #
|
| 619 |
color: #FFFFFF; /* White text for contrast */
|
| 620 |
border: none;
|
| 621 |
padding: 10px 20px;
|
|
@@ -623,21 +630,23 @@ st.markdown(
|
|
| 623 |
transition: background-color 0.3s;
|
| 624 |
}
|
| 625 |
.stButton > button:hover {
|
| 626 |
-
background-color: #
|
| 627 |
}
|
| 628 |
/* Expander header and content background */
|
| 629 |
.streamlit-expanderHeader, .streamlit-expanderContent {
|
| 630 |
-
background-color: #
|
| 631 |
color: #333333;
|
| 632 |
}
|
| 633 |
</style>
|
| 634 |
""",
|
| 635 |
unsafe_allow_html=True)
|
| 636 |
-
st.subheader("NER and Topic Analysis Report Generator", divider="
|
| 637 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
|
| 638 |
expander = st.expander("**Important notes**")
|
| 639 |
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
|
| 640 |
-
|
|
|
|
|
|
|
| 641 |
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
|
| 642 |
|
| 643 |
# --- Comet ML Setup (Placeholder/Conditional) ---
|
|
|
|
| 39 |
# Set HF_HOME environment variable to a writable path
|
| 40 |
os.environ['HF_HOME'] = '/tmp'
|
| 41 |
|
| 42 |
+
# --- Color Map for Highlighting and Network Graph Nodes (Monochrome Palette) ---
|
| 43 |
entity_color_map = {
|
| 44 |
+
"person": "#444444", # Dark Gray
|
| 45 |
+
"username": "#666666", # Medium-Dark Gray
|
| 46 |
+
"hashtag": "#888888", # Medium Gray
|
| 47 |
+
"mention" : "#aaaaaa", # Medium-Light Gray
|
| 48 |
+
"organization": "#333333", # Very Dark Gray
|
| 49 |
+
"community": "#bbbbbb", # Light Gray
|
| 50 |
+
"position": "#555555", # Slightly Dark Gray
|
| 51 |
+
"location": "#777777", # Neutral Gray
|
| 52 |
+
"event": "#999999", # Silver
|
| 53 |
+
"product": "#cccccc", # Light Gray/Silver
|
| 54 |
+
"platform": "#222222", # Black-ish
|
| 55 |
+
"date": "#dddddd", # Very Light Gray
|
| 56 |
+
"media_type": "#333333", # Very Dark Gray
|
| 57 |
+
"url": "#666666", # Medium-Dark Gray
|
| 58 |
+
"nationality_religion": "#aaaaaa" # Medium-Light Gray
|
| 59 |
}
|
| 60 |
|
| 61 |
# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
|
|
|
|
| 96 |
end = entity['end']
|
| 97 |
label = entity['label']
|
| 98 |
entity_text = entity['text']
|
| 99 |
+
# Use monochrome map
|
| 100 |
color = entity_color_map.get(label, '#000000')
|
| 101 |
|
| 102 |
# Create a span with background color and tooltip
|
|
|
|
| 104 |
# Replace the original text segment with the highlighted HTML
|
| 105 |
highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
|
| 106 |
|
| 107 |
+
# Use a div to mimic the Streamlit input box style for the report - now in monochrome
|
| 108 |
+
return f'<div style="border: 1px solid #AAAAAA; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
|
| 109 |
|
| 110 |
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
|
| 111 |
"""
|
|
|
|
| 163 |
hover_name='word',
|
| 164 |
size_max=80,
|
| 165 |
title='Topic Word Weights (Bubble Chart)',
|
| 166 |
+
color_discrete_sequence=px.colors.sequential.Greys, # Using grayscale palette
|
| 167 |
labels={
|
| 168 |
'x_pos': 'Entity/Word Index',
|
| 169 |
'weight': 'Word Weight',
|
|
|
|
| 177 |
xaxis={'tickangle': -45, 'showgrid': False},
|
| 178 |
yaxis={'showgrid': True},
|
| 179 |
showlegend=True,
|
| 180 |
+
plot_bgcolor='#f9f9f9', # Neutral background
|
| 181 |
+
paper_bgcolor='#f9f9f9', # Neutral background
|
| 182 |
height=600,
|
| 183 |
margin=dict(t=50, b=100, l=50, r=10),
|
| 184 |
)
|
|
|
|
| 254 |
showlegend=False,
|
| 255 |
marker=dict(
|
| 256 |
size=unique_entities['frequency'] * 5 + 10,
|
| 257 |
+
color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']], # Use monochrome map
|
| 258 |
line_width=1,
|
| 259 |
line_color='black',
|
| 260 |
opacity=0.9
|
|
|
|
| 326 |
# Layout 5: Title and Content (often good for charts)
|
| 327 |
chart_layout = prs.slide_layouts[5]
|
| 328 |
|
| 329 |
+
# 1. Title Slide (Layout 0)
|
| 330 |
title_slide_layout = prs.slide_layouts[0]
|
| 331 |
slide = prs.slides.add_slide(title_slide_layout)
|
| 332 |
+
slide.shapes.title.text = "NER & Topic Analysis Report"
|
| 333 |
+
|
| 334 |
+
# FIX: Add safety check for placeholder index 1 (subtitle)
|
| 335 |
+
if len(slide.placeholders) > 1:
|
| 336 |
+
subtitle = slide.placeholders[1]
|
| 337 |
+
subtitle.text = f"Source Text Analysis\nGenerated: {time.strftime('%Y-%m-%d %H:%M:%S')}\nProcessing Time: {elapsed_time:.2f} seconds"
|
| 338 |
+
# End FIX
|
| 339 |
|
| 340 |
# 2. Source Text Slide
|
| 341 |
slide = prs.slides.add_slide(chart_layout)
|
|
|
|
| 386 |
cell.fill.solid()
|
| 387 |
# Optional: Add simple styling to header
|
| 388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
# 4. Treemap Slide (Visualization)
|
| 390 |
fig_treemap = px.treemap(
|
| 391 |
df,
|
|
|
|
| 393 |
values='score',
|
| 394 |
color='category',
|
| 395 |
title="Entity Distribution by Category and Label",
|
| 396 |
+
color_discrete_sequence=px.colors.sequential.Greys # Monochrome palette
|
| 397 |
)
|
| 398 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
| 399 |
treemap_image = fig_to_image_buffer(fig_treemap)
|
|
|
|
| 406 |
# Placeholder if image conversion failed (e.g., Kaleido issue)
|
| 407 |
slide = prs.slides.add_slide(chart_layout)
|
| 408 |
slide.shapes.title.text = "Entity Distribution Treemap (Chart Failed)"
|
| 409 |
+
# FIX: Safety check for placeholder index 1
|
| 410 |
+
if len(slide.placeholders) > 1:
|
| 411 |
+
slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
|
| 412 |
|
| 413 |
|
| 414 |
# 5. Entity Count Bar Chart Slide (Visualization)
|
|
|
|
| 420 |
y='Count',
|
| 421 |
color='Category',
|
| 422 |
title='Total Entities per Category',
|
| 423 |
+
color_discrete_sequence=px.colors.sequential.Greys # Monochrome palette
|
| 424 |
)
|
| 425 |
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'})
|
| 426 |
bar_category_image = fig_to_image_buffer(fig_bar_category)
|
|
|
|
| 432 |
else:
|
| 433 |
slide = prs.slides.add_slide(chart_layout)
|
| 434 |
slide.shapes.title.text = "Total Entities per Category (Chart Failed)"
|
| 435 |
+
# FIX: Safety check for placeholder index 1
|
| 436 |
+
if len(slide.placeholders) > 1:
|
| 437 |
+
slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
|
| 438 |
|
| 439 |
|
| 440 |
# 6. Topic Modeling Bubble Chart Slide
|
|
|
|
| 450 |
else:
|
| 451 |
slide = prs.slides.add_slide(chart_layout)
|
| 452 |
slide.shapes.title.text = "Topic Word Weights (Chart Failed)"
|
| 453 |
+
# FIX: Safety check for placeholder index 1
|
| 454 |
+
if len(slide.placeholders) > 1:
|
| 455 |
+
slide.placeholders[1].text = "Chart generation failed. Check app logs for Kaleido errors."
|
| 456 |
|
| 457 |
else:
|
| 458 |
# Placeholder slide if topic modeling is not available
|
| 459 |
slide = prs.slides.add_slide(chart_layout)
|
| 460 |
slide.shapes.title.text = "Topic Modeling Results"
|
| 461 |
+
# FIX: Safety check for placeholder index 1
|
| 462 |
+
if len(slide.placeholders) > 1:
|
| 463 |
+
slide.placeholders[1].text = "Topic Modeling requires more unique input (at least two unique entities)."
|
| 464 |
|
| 465 |
# Save the presentation to an in-memory buffer
|
| 466 |
pptx_buffer = BytesIO()
|
|
|
|
| 495 |
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
|
| 496 |
values='score',
|
| 497 |
color='category',
|
| 498 |
+
|
| 499 |
+
color_discrete_sequence=px.colors.sequential.Greys # Monochrome palette
|
| 500 |
)
|
| 501 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
| 502 |
treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
|
|
|
|
| 504 |
# 1b. Pie Chart
|
| 505 |
grouped_counts = df['category'].value_counts().reset_index()
|
| 506 |
grouped_counts.columns = ['Category', 'Count']
|
| 507 |
+
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Greys) # Monochrome palette
|
| 508 |
fig_pie.update_layout(margin=dict(t=50, b=10))
|
| 509 |
pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
|
| 510 |
|
| 511 |
# 1c. Bar Chart (Category Count)
|
| 512 |
+
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.sequential.Greys) # Monochrome palette
|
| 513 |
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
|
| 514 |
bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
|
| 515 |
|
|
|
|
| 520 |
bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
|
| 521 |
|
| 522 |
if not repeating_entities.empty:
|
| 523 |
+
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Greys) # Monochrome palette
|
| 524 |
fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
|
| 525 |
bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
|
| 526 |
|
|
|
|
| 529 |
network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
|
| 530 |
|
| 531 |
# 1f. Topic Charts HTML
|
| 532 |
+
topic_charts_html = '<h3>Bubble size = word weight</h3>'
|
| 533 |
if df_topic_data is not None and not df_topic_data.empty:
|
| 534 |
bubble_figure = create_topic_word_bubbles(df_topic_data)
|
| 535 |
if bubble_figure:
|
|
|
|
| 537 |
else:
|
| 538 |
topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
|
| 539 |
else:
|
| 540 |
+
topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #AAAAAA;">'
|
| 541 |
topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
|
| 542 |
topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
|
| 543 |
topic_charts_html += '</div>'
|
| 544 |
|
| 545 |
# 2. Get Highlighted Text
|
| 546 |
+
# The div style is now monochrome/neutral (border: #AAAAAA, background: #FFFFFF)
|
| 547 |
highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
|
| 548 |
|
| 549 |
# 3. Entity Tables (Pandas to HTML)
|
|
|
|
| 553 |
)
|
| 554 |
|
| 555 |
# 4. Construct the Final HTML
|
| 556 |
+
# Updated CSS to remove all color/pink references
|
| 557 |
html_content = f"""<!DOCTYPE html><html lang="en"><head>
|
| 558 |
<meta charset="UTF-8">
|
| 559 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 560 |
<title>Entity and Topic Analysis Report</title>
|
| 561 |
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
|
| 562 |
<style>
|
| 563 |
+
body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f4; color: #333; }}
|
| 564 |
.container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
|
| 565 |
+
h1 {{ color: #333333; border-bottom: 3px solid #666666; padding-bottom: 10px; margin-top: 0; }}
|
| 566 |
+
h2 {{ color: #555555; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
|
| 567 |
h3 {{ color: #555; margin-top: 20px; }}
|
| 568 |
+
.metadata {{ background-color: #eeeeee; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
|
| 569 |
.chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
|
| 570 |
table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
|
| 571 |
table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
|
| 572 |
table th {{ background-color: #f0f0f0; }}
|
| 573 |
+
.highlighted-text {{ border: 1px solid #AAAAAA; padding: 15px; border-radius: 5px; background-color: #FFFFFF; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
|
| 574 |
</style></head><body>
|
| 575 |
<div class="container">
|
| 576 |
<h1>Entity and Topic Analysis Report</h1>
|
|
|
|
| 588 |
<h2>3. Data Visualizations</h2>
|
| 589 |
<h3>3.1 Entity Distribution Treemap</h3>
|
| 590 |
<div class="chart-box">{treemap_html}</div>
|
| 591 |
+
<h3>3.2 Comparative Charts</h3>
|
| 592 |
<div class="chart-box">{pie_html}</div>
|
| 593 |
<div class="chart-box">{bar_category_html}</div>
|
| 594 |
<div class="chart-box">{bar_freq_html}</div>
|
| 595 |
+
<h3>3.3 Entity Relationship Map</h3>
|
| 596 |
<div class="chart-box">{network_html}</div>
|
| 597 |
+
<h2>4. Topic Modelling</h2>
|
| 598 |
{topic_charts_html}
|
| 599 |
</div></body></html>
|
| 600 |
"""
|
|
|
|
| 608 |
<style>
|
| 609 |
/* Overall app container - NO SIDEBAR */
|
| 610 |
.main {
|
| 611 |
+
background-color: #F8F8F8; /* Near White/Lightest Gray */
|
| 612 |
color: #333333; /* Dark grey text for contrast */
|
| 613 |
}
|
| 614 |
.stApp {
|
| 615 |
+
background-color: #F8F8F8;
|
| 616 |
}
|
| 617 |
/* Text Area background and text color (input fields) */
|
| 618 |
.stTextArea textarea {
|
| 619 |
+
background-color: #FFFFFF; /* Pure White for input fields */
|
| 620 |
color: #000000; /* Black text for input */
|
| 621 |
+
border: 1px solid #AAAAAA; /* Gray border */
|
| 622 |
}
|
| 623 |
/* Button styling */
|
| 624 |
.stButton > button {
|
| 625 |
+
background-color: #666666; /* Medium Gray for the button */
|
| 626 |
color: #FFFFFF; /* White text for contrast */
|
| 627 |
border: none;
|
| 628 |
padding: 10px 20px;
|
|
|
|
| 630 |
transition: background-color 0.3s;
|
| 631 |
}
|
| 632 |
.stButton > button:hover {
|
| 633 |
+
background-color: #444444; /* Darker Gray on hover */
|
| 634 |
}
|
| 635 |
/* Expander header and content background */
|
| 636 |
.streamlit-expanderHeader, .streamlit-expanderContent {
|
| 637 |
+
background-color: #EEEEEE; /* Very Light Gray */
|
| 638 |
color: #333333;
|
| 639 |
}
|
| 640 |
</style>
|
| 641 |
""",
|
| 642 |
unsafe_allow_html=True)
|
| 643 |
+
st.subheader("NER and Topic Analysis Report Generator", divider="gray") # Divider is now gray
|
| 644 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="secondary")
|
| 645 |
expander = st.expander("**Important notes**")
|
| 646 |
expander.write(f"""**Named Entities:** This app predicts fifteen (15) labels: {', '.join(entity_color_map.keys())}.
|
| 647 |
+
**Dependencies:** Note that **PPTX** and **image export** require the Python libraries `python-pptx`, `plotly`, and `kaleido`. If charts in the PPTX are blank, please check your environment's $\text{kaleido}$ installation/permissions.
|
| 648 |
+
**Results:** Results are compiled into a single, comprehensive **HTML report**, a **PowerPoint (.pptx) file**, and a **CSV file** for easy download and sharing.
|
| 649 |
+
**How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract entities and generate the report.""")
|
| 650 |
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
|
| 651 |
|
| 652 |
# --- Comet ML Setup (Placeholder/Conditional) ---
|