Spaces:
Build error
Build error
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +54 -61
src/streamlit_app.py
CHANGED
|
@@ -2,7 +2,6 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import os
|
| 5 |
-
import re
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
from collections import Counter, defaultdict
|
|
@@ -316,68 +315,62 @@ def main():
|
|
| 316 |
token_instances = predictions_df[predictions_df['Token'] == selected_token]
|
| 317 |
|
| 318 |
if not token_instances.empty:
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
if
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
if fig:
|
| 354 |
-
st.pyplot(fig)
|
| 355 |
-
plt.close(fig)
|
| 356 |
else:
|
| 357 |
st.info("No tokens found in this cluster")
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
else:
|
| 379 |
-
st.info("No contexts found in this cluster")
|
| 380 |
-
|
| 381 |
else:
|
| 382 |
st.warning(f"No instances found for token: {selected_token}")
|
| 383 |
else:
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import os
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
from wordcloud import WordCloud
|
| 7 |
from collections import Counter, defaultdict
|
|
|
|
| 315 |
token_instances = predictions_df[predictions_df['Token'] == selected_token]
|
| 316 |
|
| 317 |
if not token_instances.empty:
|
| 318 |
+
# Display token info with styling
|
| 319 |
+
st.markdown(f'<h2 class="section-header">Salient Token: <span class="highlight-token">{selected_token}</span></h2>',
|
| 320 |
+
unsafe_allow_html=True)
|
| 321 |
+
|
| 322 |
+
# Get most frequent cluster (Top 1) for this token
|
| 323 |
+
top_cluster = token_instances['Top 1'].value_counts().index[0]
|
| 324 |
+
|
| 325 |
+
# Display primary cluster
|
| 326 |
+
st.metric("Primary Cluster", top_cluster)
|
| 327 |
+
|
| 328 |
+
st.markdown("<hr>", unsafe_allow_html=True)
|
| 329 |
+
|
| 330 |
+
# Check if token is salient (has position_idx = -1)
|
| 331 |
+
is_salient = any(token_instances['position_idx'] == -1)
|
| 332 |
+
|
| 333 |
+
if is_salient: # If position_idx is -1, this is a salient token
|
| 334 |
+
st.subheader("Original Sentence Context")
|
| 335 |
+
dev_sentences = load_dev_sentences()
|
| 336 |
+
if dev_sentences:
|
| 337 |
+
line_numbers = token_instances[token_instances['position_idx'] == -1]['line_idx'].unique()
|
| 338 |
+
for line_num in line_numbers:
|
| 339 |
+
if 0 <= line_num < len(dev_sentences):
|
| 340 |
+
st.code(dev_sentences[line_num], language="java")
|
| 341 |
+
else:
|
| 342 |
+
st.warning("Could not load sentences from dev.in")
|
| 343 |
+
elif not selected_token.startswith("[CLS]"):
|
| 344 |
+
# Word cloud visualization for non-salient, non-CLS tokens
|
| 345 |
+
unique_tokens = set(token for token, _ in clusters[top_cluster])
|
| 346 |
+
st.subheader("Tokens in Predicted Cluster")
|
| 347 |
+
if unique_tokens:
|
| 348 |
+
fig = create_wordcloud(unique_tokens)
|
| 349 |
+
if fig:
|
| 350 |
+
st.pyplot(fig)
|
| 351 |
+
plt.close(fig)
|
|
|
|
|
|
|
|
|
|
| 352 |
else:
|
| 353 |
st.info("No tokens found in this cluster")
|
| 354 |
+
|
| 355 |
+
# Only show cluster statistics for non-[CLS] tokens
|
| 356 |
+
if not selected_token.startswith("[CLS]"):
|
| 357 |
+
col1, col2 = st.columns(2)
|
| 358 |
+
with col1:
|
| 359 |
+
unique_tokens = len(set(token for token, _ in clusters[top_cluster]))
|
| 360 |
+
st.metric("Unique Tokens in Cluster", unique_tokens)
|
| 361 |
+
with col2:
|
| 362 |
+
total_occurrences = len(clusters[top_cluster])
|
| 363 |
+
st.metric("Total Token Occurrences", total_occurrences)
|
| 364 |
+
|
| 365 |
+
# Show contexts from predicted cluster in an expander
|
| 366 |
+
with st.expander("👀 View Similar Contexts (from Predicted Cluster)", expanded=False):
|
| 367 |
+
cluster_contexts = [(token, line_num) for token, line_num in clusters[top_cluster]
|
| 368 |
+
if 0 <= line_num - 1 < len(sentences)]
|
| 369 |
+
if cluster_contexts:
|
| 370 |
+
for token, line_num in cluster_contexts:
|
| 371 |
+
st.code(f"{sentences[line_num - 1]}", language="python")
|
| 372 |
+
else:
|
| 373 |
+
st.info("No contexts found in this cluster")
|
|
|
|
|
|
|
|
|
|
| 374 |
else:
|
| 375 |
st.warning(f"No instances found for token: {selected_token}")
|
| 376 |
else:
|